bitkeeper revision 1.652.1.1 (3fe4441bD7Ytc0dpv4nkQCX5YO2A8w)

author kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>

Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)

committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>

Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)
author kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)
committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)
diff --git a/.rootkeys b/.rootkeys

index 8506a8fff808bdf124816eb5aabcd14761a2966e..a21a1a92f403f481c02fec68ef5ee84acff7142c 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
@@ -80,10 +80,8 @@
  3fbd4bd6GtGwZGxYUJPOheYIR7bPaA tools/xc/py/XenoUtil.py
  3fbd0a40yT6G3M9hMpaz5xTUdl0E4g tools/xc/py/setup.py
  3f72f1bdJPsV3JCnBqs9ddL9tr6D2g xen/COPYING
-3f841450eJvqAD1Dldc0_aOweGiglQ xen/GUEST_CHANGES
  3ddb79bcbOVHh38VJzc97-JEGD4dJQ xen/Makefile
  3ddb79bcWnTwYsQRWl_PaneJfa6p0w xen/Rules.mk
-3e74d2be6ELqhaY1sW0yyHRKhpOvDQ xen/TODO
  3ddb79bcZbRBzT3elFWSX7u6NtMagQ xen/arch/i386/Makefile
  3ddb79bcBQF85CfLS4i1WGZ4oLLaCA xen/arch/i386/Rules.mk
  3e5636e5FAYZ5_vQnmgwFJfSdmO5Mw xen/arch/i386/acpitable.c
@@ -93,6 +91,7 @@
  3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c
  3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S
  3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c
+3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c
  3ddb79bcesE5E-lS4QhRhlqXxqj9cA xen/arch/i386/i387.c
  3ddb79bcCAq6IpdkHueChoVTfXqEQQ xen/arch/i386/i8259.c
  3ddb79bcBit4xJXbwtX0kb1hh2uO1Q xen/arch/i386/idle0_task.c
diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c

index a0176edfc1aced5d513f5b5040827367744d9f26..7e5c57bb0d55c692ef6028b0bae6d7d2d2ad5ec2 100644 (file)
--- a/tools/xc/lib/xc_linux_build.c
+++ b/tools/xc/lib/xc_linux_build.c
@@ -106,12 +106,12 @@ static int setup_guestos(int xc_handle,
                           const char *cmdline,
                           unsigned long shared_info_frame)
  {
-    l1_pgentry_t *vl1tab = NULL, *vl1e = NULL;
-    l2_pgentry_t *vl2tab = NULL, *vl2e = NULL;
+    l1_pgentry_t *vl1tab;
+    l2_pgentry_t *vl2tab;
      unsigned long *page_array = NULL;
      mmu_update_t *pgt_update_arr = NULL, *pgt_updates = NULL;
      int alloc_index, num_pt_pages;
-    unsigned long l2tab;
+    unsigned long l2tab, l2e, l1e=0;
      unsigned long l1tab = 0;
      unsigned long num_pgt_updates = 0;
      unsigned long count, pt_start, i, j;
@@ -230,44 +230,46 @@ static int setup_guestos(int xc_handle,
      if ( (vl2tab = map_pfn(pm_handle, l2tab >> PAGE_SHIFT)) == NULL )
          goto error_out;
      memset(vl2tab, 0, PAGE_SIZE);
-    vl2e = vl2tab + l2_table_offset(virt_load_addr);
+    unmap_pfn(pm_handle, vl2tab);
+    l2e = l2tab + (l2_table_offset(virt_load_addr)*sizeof(l2_pgentry_t));
      for ( count = 0; count < tot_pages; count++ )
      {    
-        if ( ((unsigned long)vl1e & (PAGE_SIZE-1)) == 0 ) 
+        if ( (l1e & (PAGE_SIZE-1)) == 0 )
          {
              l1tab = page_array[alloc_index] << PAGE_SHIFT;
              if ( (vl1tab = map_pfn(pm_handle, l1tab >> PAGE_SHIFT)) == NULL )
                  goto error_out;
              memset(vl1tab, 0, PAGE_SIZE);
+            unmap_pfn(pm_handle, vl1tab);
              alloc_index--;
                 
-            vl1e = vl1tab + l1_table_offset(virt_load_addr + 
-                                            (count << PAGE_SHIFT));
+            l1e = l1tab + (l1_table_offset(virt_load_addr+(count<<PAGE_SHIFT))*
+                           sizeof(l1_pgentry_t));
  
              /* make apropriate entry in the page directory */
-            pgt_updates->ptr = (unsigned long)vl2e;
+            pgt_updates->ptr = l2e;
              pgt_updates->val = l1tab | L2_PROT;
              pgt_updates++;
              num_pgt_updates++;
-            vl2e++;
+            l2e += sizeof(l2_pgentry_t);
          }
  
          if ( count < pt_start )
          {
-            pgt_updates->ptr = (unsigned long)vl1e;
+            pgt_updates->ptr = l1e;
              pgt_updates->val = (page_array[count] << PAGE_SHIFT) | L1_PROT;
              pgt_updates++;
              num_pgt_updates++;
-            vl1e++;
+            l1e += sizeof(l1_pgentry_t);
          }
          else
          {
-            pgt_updates->ptr = (unsigned long)vl1e;
+            pgt_updates->ptr = l1e;
              pgt_updates->val = 
                  ((page_array[count] << PAGE_SHIFT) | L1_PROT) & ~_PAGE_RW;
              pgt_updates++;
              num_pgt_updates++;
-            vl1e++;
+            l1e += sizeof(l1_pgentry_t);
          }
  
          pgt_updates->ptr = 
diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c

index 2418d972195c80294f26fd8092cae6fb46fb73f2..44ebe3c9408a8c4feb18f9540021157bb1461df0 100644 (file)
--- a/tools/xc/lib/xc_linux_restore.c
+++ b/tools/xc/lib/xc_linux_restore.c
@@ -301,7 +301,8 @@ int xc_linux_restore(int xc_handle,
                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
                  }
                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
-                                    (unsigned long)&ppage[j], page[j]) )
+                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l1_pgentry_t)),
+                                    page[j]) )
                      goto out;
              }
              break;
@@ -337,7 +338,8 @@ int xc_linux_restore(int xc_handle,
                      page[j] |= pfn_to_mfn_table[pfn] << PAGE_SHIFT;
                  }
                  if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
-                                    (unsigned long)&ppage[j], page[j]) )
+                                    (mfn<<PAGE_SHIFT)+(j*sizeof(l2_pgentry_t)),
+                                    page[j]) )
                      goto out;
              }
              break;
@@ -345,9 +347,6 @@ int xc_linux_restore(int xc_handle,
              memcpy(ppage, page, PAGE_SIZE);
              break;
          }
-        /* NB. Must flush before unmapping page, as pass VAs to Xen. */
-        if ( flush_mmu_updates(xc_handle, mmu_updates, &mmu_update_idx) )
-            goto out;
          unmap_pfn(pm_handle, ppage);
  
          if ( add_mmu_update(xc_handle, mmu_updates, &mmu_update_idx,
diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c

index 463efb7acb25dc3869b04ba7af2750b7b6912f26..e5f5934cfff306fb976e5269c6f7021ac206260b 100644 (file)
--- a/tools/xc/lib/xc_linux_save.c
+++ b/tools/xc/lib/xc_linux_save.c
@@ -44,19 +44,20 @@ static int check_pfn_ownership(int xc_handle,
  {
      dom0_op_t op;
      op.cmd = DOM0_GETPAGEFRAMEINFO;
-    op.u.getpageframeinfo.pfn = mfn;
-    if ( (do_dom0_op(xc_handle, &op) < 0) || 
-         (op.u.getpageframeinfo.domain != dom) )
-        return 0;
-    return 1;
+    op.u.getpageframeinfo.pfn    = mfn;
+    op.u.getpageframeinfo.domain = dom;
+    return (do_dom0_op(xc_handle, &op) >= 0);
  }
  
  #define GETPFN_ERR (~0U)
-static unsigned int get_pfn_type(int xc_handle, unsigned long mfn)
+static unsigned int get_pfn_type(int xc_handle, 
+                                 unsigned long mfn, 
+                                 unsigned int dom)
  {
      dom0_op_t op;
      op.cmd = DOM0_GETPAGEFRAMEINFO;
-    op.u.getpageframeinfo.pfn = mfn;
+    op.u.getpageframeinfo.pfn    = mfn;
+    op.u.getpageframeinfo.domain = dom;
      if ( do_dom0_op(xc_handle, &op) < 0 )
      {
          PERROR("Unexpected failure when getting page frame info!");
@@ -259,7 +260,8 @@ int xc_linux_save(int xc_handle,
          mfn_to_pfn_table[mfn] = i;
  
          /* Query page type by MFN, but store it by PFN. */
-        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn)) == GETPFN_ERR )
+        if ( (pfn_type[i] = get_pfn_type(xc_handle, mfn, domid)) == 
+             GETPFN_ERR )
              goto out;
      }
  
diff --git a/xen/GUEST_CHANGES b/xen/GUEST_CHANGES

deleted file mode 100644 (file)

index b9f25d4..0000000
--- a/xen/GUEST_CHANGES
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The interface between Xen and overlying guest OSes has changed in the
-following ways since version 1.0:
-
-Modified hypercall 'pt_update'
-------------------------------
-Page-table updates passed to the 'pt_update' hypercall must now
-specify a virtual address that maps the PTE to be modified. Previously
-a physical address was used, requiring Xen to temporarily map the PTE
-into its own private region so that it could be read and written.
-This affects only commands of type PGREQ_NORMAL_UPDATE and
-PGREQ_UNCHECKED_UPDATE.
-
-New hypercall 'update_va_mapping'
----------------------------------
-A new high-speed page-table update method has been introduced, which
-may be of particular benefit when fixing up application page faults.
-Invoked as 'update_va_mapping(page_number, new_pte_value, flags)':
- <page_number>: The virtual page number in the current address space 
-                whose PTE is to be modified.
- <new_pte_value>: The new value to write into the PTE.
- <flags>: An ORed combination of
-          UVMF_INVLPG: Flush stale TLB entry of the updated page mapping
-          UVMF_FLUSH_TLB: Flush all TLB entries
-You can see this new call in use in Xenolinux (common/memory.c).
-
diff --git a/xen/TODO b/xen/TODO

deleted file mode 100644 (file)

index 5eead81..0000000
--- a/xen/TODO
+++ /dev/null
@@ -1,54 +0,0 @@
-
-This is stuff we probably want to implement in the near future.
-
- -- Keir (16/3/03)
-
-
-1. DOMAIN-0 MANAGEMENT DAEMON
------------------------------
-A better control daemon is required for domain 0, which keeps proper
-track of machine resources and can make sensible policy choices. This
-may require support in Xen; for example, notifications (eg. DOMn is
-killed), and requests (eg. can DOMn allocate x frames of memory?).
-
-2. ASSIGNING DOMAINS TO PROCESSORS
-----------------------------------
-More intelligent assignment of domains to processors. In
-particular, we don't play well with hyperthreading: we will assign
-domains to virtual processors on the same package, rather then
-spreading them across processor packages.
-
-What we need to do is port code from Linux which stores information on
-relationships between processors in the system (eg. which ones are
-siblings in the same package). We then use this to balance domains
-across packages, and across virtual processors within a package.
-
-3. SANE NETWORK ROUTING
------------------------
-The current virtual firewall/router is completely broken. Needs a new
-design and implementation!
-
-
-
-Graveyard
-*********
-
-The hypervisor page cache
--------------------------
-This will allow guest OSes to make use of spare pages in the system, but
-allow them to be immediately used for any new domains or memory requests.
-The idea is that, when a page is laundered and falls off Linux's clean_LRU
-list, rather than freeing it it becomes a candidate for passing down into
-the hypervisor. In return, xeno-linux may ask for one of its previously-
-cached pages back:
- (page, new_id) = cache_query(page, old_id);
-If the requested page couldn't be kept, a blank page is returned.
-When would Linux make the query? Whenever it wants a page back without
-the delay or going to disc. Also, whenever a page would otherwise be
-flushed to disc.
-
-To try and add to the cache: (blank_page, new_id) = cache_query(page, NULL);
- [NULL means "give me a blank page"].
-To try and retrieve from the cache: (page, new_id) = cache_query(x_page, id)
- [we may request that x_page just be discarded, and therefore not impinge
-  on this domain's cache quota].
diff --git a/xen/arch/i386/Rules.mk b/xen/arch/i386/Rules.mk

index e137a1abd396d5e01f76b7fc9ebd5f05ffaf6ed0..4d00a727ecf6683dc8d183603cbe4cfc6e038a5c 100644 (file)
--- a/xen/arch/i386/Rules.mk
+++ b/xen/arch/i386/Rules.mk
@@ -8,8 +8,8 @@ MONITOR_BASE := 0xFC500000
  # Bootloader should load monitor to this real address
  LOAD_BASE    := 0x00100000
  CFLAGS  := -nostdinc -fno-builtin -O3 -Wall -DMONITOR_BASE=$(MONITOR_BASE) 
-CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
-#CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
+#CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__ -DNDEBUG
+CFLAGS  += -fomit-frame-pointer -I$(BASEDIR)/include -D__KERNEL__
  LDFLAGS := -T xeno.lds -N
  
  
diff --git a/xen/arch/i386/apic.c b/xen/arch/i386/apic.c

index 8a3a6b5cf8bf937467ffed1084b4752ec5fa8444..b3cd649c9cdfa2c9388db0885934b6337adc2e6e 100644 (file)
--- a/xen/arch/i386/apic.c
+++ b/xen/arch/i386/apic.c
@@ -47,7 +47,7 @@
  #include <asm/hardirq.h>
  #include <asm/apic.h>
  #include <xeno/mm.h>
-
+#include <asm/io_apic.h>
  #include <asm/timex.h>
  #include <xeno/ac_timer.h>
  #include <xeno/perfc.h>
diff --git a/xen/arch/i386/entry.S b/xen/arch/i386/entry.S

index e06c565de715f87ced5994f8bccf2a0fc6275e49..dc55e35041f7d53296b6aa4e22ba4a0138ff779a 100644 (file)
--- a/xen/arch/i386/entry.S
+++ b/xen/arch/i386/entry.S
@@ -82,7 +82,6 @@
  #include <xeno/config.h>
  #include <xeno/errno.h>
  #include <hypervisor-ifs/hypervisor-if.h>
-#include <asm/smp.h>
  
  EBX            = 0x00
  ECX            = 0x04
diff --git a/xen/arch/i386/flushtlb.c b/xen/arch/i386/flushtlb.c

new file mode 100644 (file)

index 0000000..fc543eb
--- /dev/null
+++ b/xen/arch/i386/flushtlb.c
@@ -0,0 +1,64 @@
+/******************************************************************************
+ * flushtlb.c
+ * 
+ * TLB flushes are timestamped using a global virtual 'clock' which ticks
+ * on any TLB flush on any processor.
+ * 
+ * Copyright (c) 2003, K A Fraser
+ */
+
+#include <xeno/config.h>
+#include <xeno/sched.h>
+#include <asm/flushtlb.h>
+
+unsigned long tlbflush_mask;
+unsigned long tlbflush_clock;
+unsigned long tlbflush_time[NR_CPUS];
+
+static inline void tlb_clocktick(unsigned int cpu)
+{
+    unsigned long x, nx, y, ny;
+    
+    clear_bit(cpu, &tlbflush_mask);
+
+    /* Tick the clock. 'y' contains the current time after the tick. */
+    ny = tlbflush_clock;
+    do {
+#ifdef CONFIG_SMP
+        if ( unlikely(((y = ny+1) & (GLOBAL_FLUSH_PERIOD - 1)) == 0) )
+        {
+            new_tlbflush_clock_period();
+            y = tlbflush_clock;
+            break;
+        }
+#else
+        y = ny+1;
+#endif
+    }
+    while ( unlikely((ny = cmpxchg(&tlbflush_clock, y-1, y)) != y-1) );
+
+    /* Update cpu's timestamp to current time, unless someone else beats us. */
+    nx = tlbflush_time[cpu];
+    do { 
+        if ( unlikely((x = nx) >= y) )
+            break;
+    }
+    while ( unlikely((nx = cmpxchg(&tlbflush_time[cpu], x, y)) != x) );
+}
+
+void write_cr3_counted(unsigned long pa)
+{
+    __asm__ __volatile__ ( 
+        "movl %0, %%cr3"
+        : : "r" (pa) : "memory" );
+    tlb_clocktick(smp_processor_id());
+}
+
+void flush_tlb_counted(void)
+{
+    __asm__ __volatile__ ( 
+        "movl %%cr3, %%eax; movl %%eax, %%cr3"
+        : : : "memory", "eax" );
+    tlb_clocktick(smp_processor_id());
+}
+
diff --git a/xen/arch/i386/io_apic.c b/xen/arch/i386/io_apic.c

index 951763a0537fc8990974b2973ef1d22bf509d64d..7369966dd847969cec78152213152ed3423840a9 100644 (file)
--- a/xen/arch/i386/io_apic.c
+++ b/xen/arch/i386/io_apic.c
@@ -28,6 +28,8 @@
  #include <xeno/config.h>
  #include <asm/mc146818rtc.h>
  #include <asm/io.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
  #include <asm/smp.h>
  #include <asm/desc.h>
  #include <asm/smpboot.h>
diff --git a/xen/arch/i386/ioremap.c b/xen/arch/i386/ioremap.c

index 06c09f85201e2cdd4e378d7f5b63bd35ec782631..c650d0b5d8f9010087ec0f0c239cb2514519fa2d 100644 (file)
--- a/xen/arch/i386/ioremap.c
+++ b/xen/arch/i386/ioremap.c
@@ -15,92 +15,50 @@
  #include <asm/pgalloc.h>
  #include <asm/page.h>
  
-static unsigned long remap_base = 0;
+static unsigned long remap_base = IOREMAP_VIRT_START;
  
  #define PAGE_ALIGN(addr)    (((addr)+PAGE_SIZE-1)&PAGE_MASK)
  
-static void new_l2e(l2_pgentry_t *pl2e)
-{
-    l1_pgentry_t *pl1e = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
-    if ( !pl1e ) BUG();
-    clear_page(pl1e);
-    *pl2e = mk_l2_pgentry(__pa(pl1e)|__PAGE_HYPERVISOR);
-}
-
-
-void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+void * __ioremap(unsigned long phys_addr, 
+                 unsigned long size, 
+                 unsigned long flags)
  {
      unsigned long vaddr;
      unsigned long offset, cur=0, last_addr;
      l2_pgentry_t *pl2e;
      l1_pgentry_t *pl1e;
  
-    /* First time through, start allocating from far end of virtual memory. */
-    if ( !remap_base ) remap_base = IOREMAP_VIRT_START;
-
      /* Don't allow wraparound or zero size */
      last_addr = phys_addr + size - 1;
-    if (!size || last_addr < phys_addr)
+    if ( (size == 0) || (last_addr < phys_addr) )
          return NULL;
  
-    /*
-     * Don't remap the low PCI/ISA area, it's always mapped..
-     */
-    if (phys_addr >= 0xA0000 && last_addr < 0x100000)
+    /* Don't remap the low PCI/ISA area: it's always mapped. */
+    if ( (phys_addr >= 0xA0000) && (last_addr < 0x100000) )
          return phys_to_virt(phys_addr);
  
-    if(remap_base + size > IOREMAP_VIRT_END-1) {
-      printk("ioremap: going past end of reserved space!\n");
-      return NULL;
-    }
-#if 0
-    /*
-     * Don't allow anybody to remap normal RAM that we're using..
-     */
-    if (phys_addr < virt_to_phys(high_memory)) {
-        char *t_addr, *t_end;
-        struct pfn_info *page;
-
-        t_addr = __va(phys_addr);
-        t_end = t_addr + (size - 1);
-          
-        for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
-            if(!PageReserved(page))
-                return NULL;
+    if ( (remap_base + size) > (IOREMAP_VIRT_END - 1) )
+    {
+        printk("ioremap: going past end of reserved space!\n");
+        return NULL;
      }
-#endif
  
-    /*
-     * Mappings have to be page-aligned
-     */
+    /* Mappings have to be page-aligned. */
      offset = phys_addr & ~PAGE_MASK;
      phys_addr &= PAGE_MASK;
      size = PAGE_ALIGN(last_addr) - phys_addr;
  
-    /*
-     * Ok, go for it..
-     */
+    /* Ok, go for it. */
      vaddr = remap_base;
      remap_base += size;
      pl2e = &idle_pg_table[l2_table_offset(vaddr)];
-    if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
      pl1e = l2_pgentry_to_l1(*pl2e++) + l1_table_offset(vaddr);
-    for ( ; ; ) 
-    {
-        if ( !l1_pgentry_empty(*pl1e) ) BUG();
+    do {
          *pl1e++ = mk_l1_pgentry((phys_addr+cur)|PAGE_HYPERVISOR|flags);
-        cur += PAGE_SIZE;
-        if ( cur == size ) break;
-        if ( !((unsigned long)pl1e & (PAGE_SIZE-1)) )
-        {
-            if ( l2_pgentry_empty(*pl2e) ) new_l2e(pl2e);
-            pl1e = l2_pgentry_to_l1(*pl2e++);        
-        }
      }
+    while ( (cur += PAGE_SIZE) != size );
  
-    flush_tlb_all();
-
-    return (void *) (offset + (char *)vaddr);
+    return (void *)(offset + (char *)vaddr);
  }
  
  void iounmap(void *addr)
diff --git a/xen/arch/i386/irq.c b/xen/arch/i386/irq.c

index 2793eba3d7fef19d1009d5db37469c1460eac0b7..cd1bcc6b3c1b584e64d2b2063d5744395cd81f64 100644 (file)
--- a/xen/arch/i386/irq.c
+++ b/xen/arch/i386/irq.c
@@ -24,7 +24,8 @@
  #include <xeno/interrupt.h>
  #include <xeno/irq.h>
  #include <xeno/slab.h>
-
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
  #include <asm/msr.h>
  #include <asm/hardirq.h>
  #include <asm/ptrace.h>
diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c

index 5df703de7ade7b331b11a81fbe39dde37f01ad8c..84ef14cf8fe5b722cb5bfcbc86ff886714569f68 100644 (file)
--- a/xen/arch/i386/mm.c
+++ b/xen/arch/i386/mm.c
@@ -27,8 +27,8 @@
  #include <asm/fixmap.h>
  #include <asm/domain_page.h>
  
-static inline void set_pte_phys (unsigned long vaddr,
-                                 l1_pgentry_t entry)
+static inline void set_pte_phys(unsigned long vaddr,
+                                l1_pgentry_t entry)
  {
      l2_pgentry_t *l2ent;
      l1_pgentry_t *l1ent;
@@ -41,20 +41,22 @@ static inline void set_pte_phys (unsigned long vaddr,
      __flush_tlb_one(vaddr);
  }
  
-void __set_fixmap (enum fixed_addresses idx, 
-                   l1_pgentry_t entry)
+
+void __set_fixmap(enum fixed_addresses idx, 
+                  l1_pgentry_t entry)
  {
      unsigned long address = __fix_to_virt(idx);
  
-    if (idx >= __end_of_fixed_addresses) {
+    if ( likely(idx < __end_of_fixed_addresses) )
+        set_pte_phys(address, entry);
+    else
          printk("Invalid __set_fixmap\n");
-        return;
-    }
-    set_pte_phys(address, entry);
  }
  
-static void __init fixrange_init (unsigned long start, 
-                                  unsigned long end, l2_pgentry_t *pg_base)
+
+static void __init fixrange_init(unsigned long start, 
+                                 unsigned long end, 
+                                 l2_pgentry_t *pg_base)
  {
      l2_pgentry_t *l2e;
      int i;
@@ -66,7 +68,8 @@ static void __init fixrange_init (unsigned long start,
  
      for ( ; (i < ENTRIES_PER_L2_PAGETABLE) && (vaddr != end); l2e++, i++ ) 
      {
-        if ( !l2_pgentry_empty(*l2e) ) continue;
+        if ( !l2_pgentry_empty(*l2e) )
+            continue;
          page = (unsigned long)get_free_page(GFP_KERNEL);
          clear_page(page);
          *l2e = mk_l2_pgentry(__pa(page) | __PAGE_HYPERVISOR);
@@ -79,11 +82,6 @@ void __init paging_init(void)
      unsigned long addr;
      void *ioremap_pt;
  
-    /* XXX initialised in boot.S */
-    /*if ( cpu_has_pge ) set_in_cr4(X86_CR4_PGE);*/
-    /*if ( cpu_has_pse ) set_in_cr4(X86_CR4_PSE);*/
-    /*if ( cpu_has_pae ) set_in_cr4(X86_CR4_PAE);*/
-
      /*
       * Fixed mappings, only the page table structure has to be
       * created - mappings will be set by set_fixmap():
@@ -115,12 +113,12 @@ void __init paging_init(void)
  
  }
  
-void __init zap_low_mappings (void)
+void __init zap_low_mappings(void)
  {
      int i;
      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
          idle_pg_table[i] = mk_l2_pgentry(0);
-    flush_tlb_all();
+    flush_tlb_all_pge();
  }
  
  
@@ -212,86 +210,54 @@ long set_gdt(struct task_struct *p,
               unsigned int entries)
  {
      /* NB. There are 512 8-byte entries per GDT page. */
-    unsigned int i, j, nr_pages = (entries + 511) / 512;
-    unsigned long pfn, *gdt_page;
-    long ret = -EINVAL;
-    struct pfn_info *page;
+    int i, nr_pages = (entries + 511) / 512;
+    unsigned long pfn;
      struct desc_struct *vgdt;
  
-    spin_lock(&p->page_lock);
-
      /* Check the new GDT. */
      for ( i = 0; i < nr_pages; i++ )
      {
-        if ( frames[i] >= max_page ) 
-            goto out;
-        
-        page = frame_table + frames[i];
-        if ( (page->flags & PG_domain_mask) != p->domain )
-            goto out;
-
-        if ( (page->flags & PG_type_mask) != PGT_gdt_page )
-        {
-            if ( page_type_count(page) != 0 )
-                goto out;
-
-            /* Check all potential GDT entries in the page. */
-            gdt_page = map_domain_mem(frames[0] << PAGE_SHIFT);
-            for ( j = 0; j < 512; j++ )
-                if ( !check_descriptor(gdt_page[j*2], gdt_page[j*2+1]) )
-                    goto out;
-            unmap_domain_mem(gdt_page);
-        }
+        if ( unlikely(frames[i] >= max_page) ||
+             unlikely(!get_page_and_type(&frame_table[frames[i]], 
+                                         p, PGT_gdt_page)) )
+            goto fail;
      }
  
+    /* Copy reserved GDT entries to the new GDT. */
+    vgdt = map_domain_mem(frames[0] << PAGE_SHIFT);
+    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
+           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
+           NR_RESERVED_GDT_ENTRIES*8);
+    unmap_domain_mem(vgdt);
+
      /* Tear down the old GDT. */
      for ( i = 0; i < 16; i++ )
      {
-        pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i]);
+        if ( (pfn = l1_pgentry_to_pagenr(p->mm.perdomain_pt[i])) != 0 )
+            put_page_and_type(&frame_table[pfn]);
          p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
-        if ( pfn == 0 ) continue;
-        page = frame_table + pfn;
-        ASSERT((page->flags & PG_type_mask) == PGT_gdt_page);
-        ASSERT((page->flags & PG_domain_mask) == p->domain);
-        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
-        put_page_type(page);
-        put_page_tot(page);
      }
  
      /* Install the new GDT. */
      for ( i = 0; i < nr_pages; i++ )
-    {
          p->mm.perdomain_pt[i] =
              mk_l1_pgentry((frames[i] << PAGE_SHIFT) | __PAGE_HYPERVISOR);
-        
-        page = frame_table + frames[i];
-        page->flags &= ~(PG_type_mask | PG_need_flush);
-        page->flags |= PGT_gdt_page;
-        get_page_type(page);
-        get_page_tot(page);
-    }
-
-    /* Copy reserved GDT entries to the new GDT. */
-    vgdt = map_domain_mem(frames[i] << PAGE_SHIFT);
-    memcpy(vgdt + FIRST_RESERVED_GDT_ENTRY, 
-           gdt_table + FIRST_RESERVED_GDT_ENTRY, 
-           NR_RESERVED_GDT_ENTRIES*8);
-    unmap_domain_mem(vgdt);
  
      SET_GDT_ADDRESS(p, GDT_VIRT_START);
      SET_GDT_ENTRIES(p, (entries*8)-1);
  
-    ret = 0; /* success */
+    return 0;
  
- out:
-    spin_unlock(&p->page_lock);
-    return ret;
+ fail:
+    while ( i-- > 0 )
+        put_page_and_type(&frame_table[frames[i]]);
+    return -EINVAL;
  }
  
  
  long do_set_gdt(unsigned long *frame_list, unsigned int entries)
  {
-    unsigned int nr_pages = (entries + 511) / 512;
+    int nr_pages = (entries + 511) / 512;
      unsigned long frames[16];
      long ret;
  
@@ -321,14 +287,12 @@ long do_update_descriptor(
      if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
          return -EINVAL;
  
-    spin_lock(&current->page_lock);
-
-    page = frame_table + pfn;
-    if ( (page->flags & PG_domain_mask) != current->domain )
+    page = &frame_table[pfn];
+    if ( unlikely(!get_page(page, current)) )
          goto out;
  
      /* Check if the given frame is in use in an unsafe context. */
-    switch ( (page->flags & PG_type_mask) )
+    switch ( page->type_and_flags & PGT_type_mask )
      {
      case PGT_gdt_page:
          /* Disallow updates of Xen-reserved descriptors in the current GDT. */
@@ -336,12 +300,17 @@ long do_update_descriptor(
               (((pa&(PAGE_SIZE-1))>>3) >= FIRST_RESERVED_GDT_ENTRY) &&
               (((pa&(PAGE_SIZE-1))>>3) <= LAST_RESERVED_GDT_ENTRY) )
              goto out;
+        if ( unlikely(!get_page_type(page, PGT_gdt_page)) )
+            goto out;
+        break;
      case PGT_ldt_page:
-    case PGT_writeable_page:
+        if ( unlikely(!get_page_type(page, PGT_ldt_page)) )
+            goto out;
          break;
      default:
-        if ( page_type_count(page) != 0 )
+        if ( unlikely(!get_page_type(page, PGT_writeable_page)) )
              goto out;
+        break;
      }
  
      /* All is good so make the update. */
@@ -350,9 +319,11 @@ long do_update_descriptor(
      gdt_pent[1] = word2;
      unmap_domain_mem(gdt_pent);
  
+    put_page_type(page);
+
      ret = 0; /* success */
  
   out:
-    spin_unlock(&current->page_lock);
+    put_page(page);
      return ret;
  }
diff --git a/xen/arch/i386/pci-irq.c b/xen/arch/i386/pci-irq.c

index b7a212b014f813fa653946a8758bd1d6fb5928e3..2c68d9d3b31ac8ddf5c3a693f8197029bfdb7abc 100644 (file)
--- a/xen/arch/i386/pci-irq.c
+++ b/xen/arch/i386/pci-irq.c
@@ -6,16 +6,15 @@
  
  #include <linux/config.h>
  #include <linux/types.h>
-/*#include <linux/kernel.h>*/
  #include <linux/pci.h>
  #include <linux/init.h>
  #include <linux/slab.h>
  #include <linux/interrupt.h>
  #include <linux/irq.h>
  #include <linux/sched.h>
-
  #include <asm/io.h>
  #include <asm/smp.h>
+#include <asm/mpspec.h>
  #include <asm/io_apic.h>
  
  #include "pci-i386.h"
diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c

index 4f7d16d761f0528a22aad2b8b7750db01cd46027..e75ee1e05020ebb87ef022748d418d4be47cf5b4 100644 (file)
--- a/xen/arch/i386/process.c
+++ b/xen/arch/i386/process.c
@@ -27,6 +27,7 @@
  #include <asm/processor.h>
  #include <asm/desc.h>
  #include <asm/i387.h>
+#include <asm/mpspec.h>
  
  #include <xeno/irq.h>
  #include <xeno/event.h>
@@ -263,7 +264,7 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p)
      tss->ss1  = next->ss1;
  
      /* Switch page tables.  */
-    __write_cr3_counted(pagetable_val(next_p->mm.pagetable));
+    write_cr3_counted(pagetable_val(next_p->mm.pagetable));
  
      set_current(next_p);
  
diff --git a/xen/arch/i386/smp.c b/xen/arch/i386/smp.c

index b1dfe64d4fa7fdad7ad08f9b3b005e6a12492a8a..4ec5176194a6dd616104e833554b599233d84645 100644 (file)
--- a/xen/arch/i386/smp.c
+++ b/xen/arch/i386/smp.c
@@ -16,6 +16,7 @@
  #include <asm/mc146818rtc.h>
  #include <asm/pgalloc.h>
  #include <asm/smpboot.h>
+#include <asm/hardirq.h>
  
  #ifdef CONFIG_SMP
  
@@ -264,34 +265,67 @@ static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
  asmlinkage void smp_invalidate_interrupt(void)
  {
      ack_APIC_irq();
-    if (test_and_clear_bit(smp_processor_id(), &flush_cpumask))
-        local_flush_tlb();
+    clear_bit(smp_processor_id(), &flush_cpumask);
+    local_flush_tlb();
  }
  
-void flush_tlb_others(unsigned long cpumask)
+void flush_tlb_mask(unsigned long mask)
  {
-    spin_lock(&tlbstate_lock);
-    atomic_set_mask(cpumask, &flush_cpumask);
-    send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-    while (flush_cpumask) continue;
-    spin_unlock(&tlbstate_lock);
+    if ( unlikely(in_irq()) )
+        BUG();
+    
+    if ( mask & (1 << smp_processor_id()) )
+    {
+        local_flush_tlb();
+        mask &= ~(1 << smp_processor_id());
+    }
+
+    if ( mask != 0 )
+    {
+        spin_lock(&tlbstate_lock);
+        flush_cpumask = mask;
+        send_IPI_mask(mask, INVALIDATE_TLB_VECTOR);
+        while ( flush_cpumask != 0 )
+        {
+            rep_nop();
+            barrier();
+        }
+        spin_unlock(&tlbstate_lock);
+    }
  }
-       
-static inline void do_flush_tlb_all_local(void)
+
+void new_tlbflush_clock_period(void)
  {
-    __flush_tlb_all();
+    if ( unlikely(!spin_trylock(&tlbstate_lock)) )
+        return;
+
+    if ( unlikely((flush_cpumask = tlbflush_mask) != 0) )
+    {
+        send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
+        while ( flush_cpumask != 0 )
+        {
+            rep_nop();
+            barrier();
+        }
+    }
+
+    /* No need for cmpxchg updates here: we are protected by tlbstate lock. */
+    tlbflush_mask = (1 << smp_num_cpus) - 1;
+    wmb(); /* Reset the mask before allowing the clock to continue ticking. */
+    tlbflush_clock++;
+
+    spin_unlock(&tlbstate_lock);
  }
  
-static void flush_tlb_all_ipi(void* info)
+static void flush_tlb_all_pge_ipi(void* info)
  {
-    do_flush_tlb_all_local();
+    __flush_tlb_pge();
  }
  
-void flush_tlb_all(void)
+void flush_tlb_all_pge(void)
  {
-    smp_call_function (flush_tlb_all_ipi,0,1,1);
-
-    do_flush_tlb_all_local();
+    smp_call_function (flush_tlb_all_pge_ipi,0,1,1);
+    __flush_tlb_pge();
  }
  
  void smp_send_event_check_mask(unsigned long cpu_mask)
diff --git a/xen/arch/i386/smpboot.c b/xen/arch/i386/smpboot.c

index 506ec09cb92c6766cc87ecb583e647c6b4d35b42..b5a424900380a9bc55029b0e99039cd2c2d26abc 100644 (file)
--- a/xen/arch/i386/smpboot.c
+++ b/xen/arch/i386/smpboot.c
@@ -44,6 +44,8 @@
  #include <xeno/smp.h>
  #include <asm/msr.h>
  #include <asm/system.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
  #include <xeno/sched.h>
  #include <xeno/delay.h>
  #include <xeno/lib.h>
diff --git a/xen/arch/i386/traps.c b/xen/arch/i386/traps.c

index 330defe3a8b8c0c23ae9796b1f56cfcf29cc9c29..78c26c37ccee90977081a6dffd1113e0cb6abcb0 100644 (file)
--- a/xen/arch/i386/traps.c
+++ b/xen/arch/i386/traps.c
@@ -211,6 +211,7 @@ static inline void do_trap(int trapnr, char *str,
  
      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
      {
+        DPRINTK("Trap %d: %08lx -> %08lx\n", trapnr, regs->eip, fixup);
          regs->eip = fixup;
          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
          return;
@@ -328,6 +329,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, long error_code)
  
      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
      {
+        DPRINTK("Page fault: %08lx -> %08lx\n", regs->eip, fixup);
          regs->eip = fixup;
          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
          return;
@@ -411,6 +413,7 @@ asmlinkage void do_general_protection(struct pt_regs *regs, long error_code)
  
      if ( likely((fixup = search_exception_table(regs->eip)) != 0) )
      {
+        DPRINTK("GPF (%04lx): %08lx -> %08lx\n", error_code, regs->eip, fixup);
          regs->eip = fixup;
          regs->xds = regs->xes = regs->xfs = regs->xgs = __HYPERVISOR_DS;
          return;
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c

index 2f3073a1c4acda2309b8b22be1cf5a734a849130..5b24d7b5c98f45913ac5ca7cf6d33b2665936991 100644 (file)
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -38,31 +38,6 @@ static unsigned int get_domnr(void)
      return 0;
  }
  
-static void build_page_list(struct task_struct *p)
-{
-    unsigned long *list;
-    unsigned long curr;
-    struct list_head *list_ent;
-
-    curr = list_entry(p->pg_head.next, struct pfn_info, list) - frame_table;
-    list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
-
-    list_for_each(list_ent, &p->pg_head)
-    {
-        *list++ = list_entry(list_ent, struct pfn_info, list) - frame_table;
-
-        if( ((unsigned long)list & ~PAGE_MASK) == 0 )
-        {
-            struct list_head *ent = frame_table[curr].list.next;
-            curr = list_entry(ent, struct pfn_info, list) - frame_table;
-            unmap_domain_mem(list-1);
-            list = (unsigned long *)map_domain_mem(curr << PAGE_SHIFT);
-        }
-    }
-
-    unmap_domain_mem(list);
-}
-
  static int msr_cpu_mask;
  static unsigned long msr_addr;
  static unsigned long msr_lo;
@@ -164,8 +139,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
              goto exit_create;
          }
  
-        build_page_list(p);
-        
          ret = p->domain;
          
          op.u.createdomain.domain = ret;
@@ -245,7 +218,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
      case DOM0_GETMEMLIST:
      {
          int i;
-        struct task_struct * p = find_domain_by_id(op.u.getmemlist.domain);
+        struct task_struct *p = find_domain_by_id(op.u.getmemlist.domain);
          unsigned long max_pfns = op.u.getmemlist.max_pfns;
          unsigned long pfn;
          unsigned long *buffer = op.u.getmemlist.buffer;
@@ -254,28 +227,27 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
          ret = -EINVAL;
          if ( p != NULL )
          {
-            list_ent = p->pg_head.next;
-            pfn = list_entry(list_ent, struct pfn_info, list) - frame_table;
-            
-            for ( i = 0; (i < max_pfns) && (list_ent != &p->pg_head); i++ )
+            ret = 0;
+
+            spin_lock(&p->page_list_lock);
+            list_ent = p->page_list.next;
+            for ( i = 0; (i < max_pfns) && (list_ent != &p->page_list); i++ )
              {
+                pfn = list_entry(list_ent, struct pfn_info, list) - 
+                    frame_table;
                  if ( put_user(pfn, buffer) )
                  {
                      ret = -EFAULT;
-                    goto out_getmemlist;
+                    break;
                  }
                  buffer++;
                  list_ent = frame_table[pfn].list.next;
-                pfn = list_entry(list_ent, struct pfn_info, list) - 
-                    frame_table;
              }
+            spin_unlock(&p->page_list_lock);
  
              op.u.getmemlist.num_pfns = i;
              copy_to_user(u_dom0_op, &op, sizeof(op));
-
-            ret = 0;
-
-        out_getmemlist:
+            
              put_task_struct(p);
          }
      }
@@ -368,21 +340,24 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
      {
          struct pfn_info *page;
          unsigned long pfn = op.u.getpageframeinfo.pfn;
-        
-        if ( pfn >= max_page )
-        {
-            ret = -EINVAL;
-        }
-        else
+        unsigned int dom = op.u.getpageframeinfo.domain;
+        struct task_struct *p;
+
+        ret = -EINVAL;
+
+        if ( unlikely(pfn >= max_page) || 
+             unlikely((p = find_domain_by_id(dom)) == NULL) )
+            break;
+
+        page = &frame_table[pfn];
+
+        if ( likely(get_page(page, p)) )
          {
-            page = frame_table + pfn;
-            
-            op.u.getpageframeinfo.domain = page->flags & PG_domain_mask;
-            op.u.getpageframeinfo.type   = NONE;
+            op.u.getpageframeinfo.type = NONE;
  
-            if ( page_type_count(page) != 0 )
+            if ( (page->type_and_flags & PGT_count_mask) != 0 )
              {
-                switch ( page->flags & PG_type_mask )
+                switch ( page->type_and_flags & PGT_type_mask )
                  {
                  case PGT_l1_page_table:
                      op.u.getpageframeinfo.type = L1TAB;
@@ -392,9 +367,13 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
                      break;
                  }
              }
-
-            copy_to_user(u_dom0_op, &op, sizeof(op));
+            
+            put_page(page);
          }
+
+        put_task_struct(p);
+
+        copy_to_user(u_dom0_op, &op, sizeof(op));
      }
      break;
  
diff --git a/xen/common/dom_mem_ops.c b/xen/common/dom_mem_ops.c

index c8869882aef86ba8f8e71151b10c4d4e5ce5d5e7..53facf8477ee5080198076ed2c59c5b69c7da3da 100644 (file)
--- a/xen/common/dom_mem_ops.c
+++ b/xen/common/dom_mem_ops.c
@@ -16,58 +16,26 @@
  #include <xeno/event.h>
  #include <asm/domain_page.h>
  
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
  static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
  {
-    struct list_head *temp;
-    struct pfn_info  *pf;     /* pfn_info of current page */
+    struct pfn_info  *page;
      unsigned long     mpfn;   /* machine frame number of current page */
      void             *va;     /* Xen-usable mapping of current page */
      unsigned long     i;
-    unsigned long     flags;
-
-    /*
-     * POLICY DECISION: Each domain has a page limit.
-     * NB. The first part of test is because op.size could be so big that
-     * tot_pages + op.size overflows a u_long.
-     */
-    if( (op.size > p->max_pages) ||
-        ((p->tot_pages + op.size) > p->max_pages) )
-        return -ENOMEM;
-
-    spin_lock_irqsave(&free_list_lock, flags);
-
-    if ( free_pfns < (op.size + (SLACK_DOMAIN_MEM_KILOBYTES >> 
-                                  (PAGE_SHIFT-10))) ) 
-    {
-        spin_unlock_irqrestore(&free_list_lock, flags);
-        return -ENOMEM;
-    }
  
-    spin_lock(&p->page_lock);
-    
-    temp = free_list.next;
      for ( i = 0; i < op.size; i++ )
      {
-        /* Get a free page and add it to the domain's page list. */
-        pf = list_entry(temp, struct pfn_info, list);
-        pf->flags |= p->domain;
-        set_page_type_count(pf, 0);
-        set_page_tot_count(pf, 0);
-        temp = temp->next;
-        list_del(&pf->list);
-        list_add_tail(&pf->list, &p->pg_head);
-        free_pfns--;
-
-        p->tot_pages++;
-
+        /* Leave some slack pages; e.g., for the network. */
+        if ( unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
+                                   (PAGE_SHIFT-10))) ) 
+            break;
+
+        /* NB. 'alloc_domain_page' does limit checking on pages per domain. */
+        if ( unlikely((page = alloc_domain_page(p)) == NULL) )
+            break;
+        
          /* Inform the domain of the new page's machine address. */ 
-        mpfn = (unsigned long)(pf - frame_table);
+        mpfn = (unsigned long)(page - frame_table);
          copy_to_user(op.pages, &mpfn, sizeof(mpfn));
          op.pages++; 
  
@@ -77,26 +45,17 @@ static long alloc_dom_mem(struct task_struct *p, reservation_increase_t op)
          unmap_domain_mem(va);
      }
  
-    spin_unlock(&p->page_lock);
-    spin_unlock_irqrestore(&free_list_lock, flags);
-    
-    return op.size;
+    return i;
  }
      
  static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
  {
-    struct list_head *temp;
-    struct pfn_info  *pf;     /* pfn_info of current page */
+    struct pfn_info  *page;
      unsigned long     mpfn;   /* machine frame number of current page */
      unsigned long     i;
-    unsigned long     flags;
      long              rc = 0;
      int               need_flush = 0;
  
-    spin_lock_irqsave(&free_list_lock, flags);
-    spin_lock(&p->page_lock);
-
-    temp = free_list.next;
      for ( i = 0; i < op.size; i++ )
      {
          copy_from_user(&mpfn, op.pages, sizeof(mpfn));
@@ -109,37 +68,28 @@ static long free_dom_mem(struct task_struct *p, reservation_decrease_t op)
              goto out;
          }
  
-        pf = &frame_table[mpfn];
-        if ( (page_type_count(pf) != 0) || 
-             (page_tot_count(pf) != 0) ||
-             ((pf->flags & PG_domain_mask) != p->domain) )
+        page = &frame_table[mpfn];
+        if ( unlikely(!get_page(page, p)) )
          {
-            DPRINTK("Bad page free for domain %d (%ld, %ld, %08lx)\n",
-                    p->domain, page_type_count(pf), 
-                    page_tot_count(pf), pf->flags);
+            DPRINTK("Bad page free for domain %d\n", p->domain);
              rc = -EINVAL;
              goto out;
          }
  
-        need_flush |= pf->flags & PG_need_flush;
-
-        pf->flags = 0;
+        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
+            put_page_and_type(page);
  
-        list_del(&pf->list);
-        list_add(&pf->list, &free_list);
-        free_pfns++;
+        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
+            put_page(page);
  
-        p->tot_pages--;
+        put_page(page);
      }
  
   out:
-    spin_unlock(&p->page_lock);
-    spin_unlock_irqrestore(&free_list_lock, flags);
-    
      if ( need_flush )
      {
          __flush_tlb();
-        perfc_incrc(need_flush_tlb_flush);
+        perfc_incr(need_flush_tlb_flush);
      }
  
      return rc ? rc : op.size;
diff --git a/xen/common/domain.c b/xen/common/domain.c

index eae232206bac669cc7c2356e2ee65aff2a3d6f7c..4f23778e467273c5a14aa648c11b8df0caafae48 100644 (file)
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -51,12 +51,11 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu)
      sprintf(p->name, "Domain-%d", dom_id);
  
      spin_lock_init(&p->blk_ring_lock);
-    spin_lock_init(&p->page_lock);
      spin_lock_init(&p->event_channel_lock);
  
      p->shared_info = (void *)get_free_page(GFP_KERNEL);
      memset(p->shared_info, 0, PAGE_SIZE);
-    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), dom_id);
+    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
  
      p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
      memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
@@ -67,8 +66,10 @@ struct task_struct *do_createdomain(unsigned int dom_id, unsigned int cpu)
  
      sched_add_domain(p);
  
-    INIT_LIST_HEAD(&p->pg_head);
+    spin_lock_init(&p->page_list_lock);
+    INIT_LIST_HEAD(&p->page_list);
      p->max_pages = p->tot_pages = 0;
+
      write_lock_irqsave(&tasklist_lock, flags);
      SET_LINKS(p);
      p->next_hash = task_hash[TASK_HASH(dom_id)];
@@ -218,77 +219,203 @@ long stop_other_domain(unsigned int dom)
      return 0;
  }
  
-unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
+struct pfn_info *alloc_domain_page(struct task_struct *p)
  {
-    struct list_head *temp;
-    struct pfn_info *pf;
-    unsigned int alloc_pfns;
-    unsigned int req_pages;
-    unsigned long flags;
-
-    /* how many pages do we need to alloc? */
-    req_pages = kbytes >> (PAGE_SHIFT - 10);
+    struct pfn_info *page = NULL;
+    unsigned long flags, mask, pfn_stamp, cpu_stamp;
+    int i;
  
      spin_lock_irqsave(&free_list_lock, flags);
-    
-    /* is there enough mem to serve the request? */   
-    if ( (req_pages + (SLACK_DOMAIN_MEM_KILOBYTES >> (PAGE_SHIFT-10))) >
-         free_pfns )
+    if ( likely(!list_empty(&free_list)) )
      {
-        spin_unlock_irqrestore(&free_list_lock, flags);
-        return -1;
+        page = list_entry(free_list.next, struct pfn_info, list);
+        list_del(&page->list);
+        free_pfns--;
      }
+    spin_unlock_irqrestore(&free_list_lock, flags);
+    /* An empty free list is reported to the caller as NULL. */
+    if ( unlikely(page == NULL) )
+        return NULL;
  
-    /* allocate pages and build a thread through frame_table */
-    temp = free_list.next;
-    for ( alloc_pfns = 0; alloc_pfns < req_pages; alloc_pfns++ )
+    if ( unlikely((mask = page->u.cpu_mask) != 0) )
      {
-        pf = list_entry(temp, struct pfn_info, list);
-        pf->flags = p->domain;
-        set_page_type_count(pf, 0);
-        set_page_tot_count(pf, 0);
-        temp = temp->next;
-        list_del(&pf->list);
-        list_add_tail(&pf->list, &p->pg_head);
-        free_pfns--;
-        ASSERT(free_pfns != 0);
+        pfn_stamp = page->tlbflush_timestamp;
+        for ( i = 0; (mask != 0) && (i < NR_CPUS); i++ ) /* bounded scan */
+        {
+            if ( unlikely(mask & (1<<i)) )
+            {
+                cpu_stamp = tlbflush_time[i];
+                if ( !NEED_FLUSH(cpu_stamp, pfn_stamp) )
+                    mask &= ~(1<<i);
+            }
+        }
+        /* Bits left in 'mask' are CPUs for which NEED_FLUSH() still held. */
+        if ( unlikely(mask != 0) )
+        {
+            if ( unlikely(in_irq()) )
+            {
+                DPRINTK("Returning NULL from alloc_domain_page: in_irq\n");
+                goto free_and_exit;
+            }
+            perfc_incrc(need_flush_tlb_flush);
+            flush_tlb_mask(mask);
+        }
      }
-   
-    spin_unlock_irqrestore(&free_list_lock, flags);
-    
-    p->tot_pages = req_pages;
  
-    /* TEMPORARY: max_pages should be explicitly specified. */
-    p->max_pages = p->tot_pages;
+    page->u.domain = p;
+    page->type_and_flags = 0;
+    if ( p != NULL )
+    {
+        if ( unlikely(in_irq()) )
+            BUG();
+        wmb(); /* Domain pointer must be visible before updating refcnt. */
+        spin_lock(&p->page_list_lock);
+        if ( unlikely(p->tot_pages >= p->max_pages) )
+        {
+            spin_unlock(&p->page_list_lock);
+            goto free_and_exit;
+        }
+        list_add_tail(&page->list, &p->page_list);
+        p->tot_pages++;
+        page->count_and_flags = PGC_allocated | 1;
+        spin_unlock(&p->page_list_lock);
+    }
  
-    return 0;
+    return page;
+    /* Failure path: hand the frame back to the global free list. */
+ free_and_exit:
+    spin_lock_irqsave(&free_list_lock, flags);
+    list_add(&page->list, &free_list);
+    free_pfns++;
+    spin_unlock_irqrestore(&free_list_lock, flags);
+    return NULL;
  }
- 
  
-void free_all_dom_mem(struct task_struct *p)
+void free_domain_page(struct pfn_info *page)
  {
-    struct list_head *ent;
      unsigned long flags;
+    struct task_struct *p = page->u.domain;
  
-    spin_lock_irqsave(&free_list_lock, flags);
-    while ( (ent = p->pg_head.next) != &p->pg_head )
+    if ( unlikely(in_irq()) )
+        BUG(); /* Must not be called from IRQ context. */
+    /* Domain frames go back on free_list; Xen-heap frames go via free_page(). */
+    if ( likely(!IS_XEN_HEAP_FRAME(page)) )
      {
-        struct pfn_info *pf = list_entry(ent, struct pfn_info, list);
-        set_page_type_count(pf, 0);
-        set_page_tot_count(pf, 0);
-        pf->flags = 0;
-        ASSERT(ent->next->prev == ent);
-        ASSERT(ent->prev->next == ent);
-        list_del(ent);
-        list_add(ent, &free_list);
+        /*
+         * No race with setting of zombie bit. If it wasn't set before the
+         * last reference was dropped, then it can't be set now.
+         */
+        page->u.cpu_mask = 0;
+        if ( !(page->count_and_flags & PGC_zombie) )
+        {
+            page->tlbflush_timestamp = tlbflush_clock;
+            page->u.cpu_mask = 1 << p->processor;
+
+            spin_lock(&p->page_list_lock);
+            list_del(&page->list);
+            p->tot_pages--;
+            spin_unlock(&p->page_list_lock);
+        }
+        /* Zombie pages skip the unlink above; free_all_dom_mem() already did it. */
+        page->count_and_flags = 0;
+
+        spin_lock_irqsave(&free_list_lock, flags);
+        list_add(&page->list, &free_list);
          free_pfns++;
+        spin_unlock_irqrestore(&free_list_lock, flags);
      }
-    spin_unlock_irqrestore(&free_list_lock, flags);
+    else
+    {
+        /*
+         * No need for a TLB flush. Non-domain pages are always co-held by Xen,
+         * and the Xen reference is not dropped until the domain is dead.
+         * DOM0 may hold references, but it's trusted so no need to flush.
+         */
+        page->u.cpu_mask = 0;
+        page->count_and_flags = 0;
+        free_page((unsigned long)page_to_virt(page));
+    }
+}
+
+
+void free_all_dom_mem(struct task_struct *p)
+{
+    struct list_head *ent, zombies;
+    struct pfn_info *page;
+    /* Pass 1 (locked): mark each page a zombie and move it to a private list. */
+    INIT_LIST_HEAD(&zombies);
+
+    spin_lock(&p->page_list_lock);
+    while ( (ent = p->page_list.next) != &p->page_list )
+    {
+        page = list_entry(ent, struct pfn_info, list);
+
+        if ( unlikely(!get_page(page, p)) )
+        {
+            /*
+             * Another CPU has dropped the last reference and is responsible 
+             * for removing the page from this list. Wait for them to do so.
+             */
+            spin_unlock(&p->page_list_lock);
+            while ( p->page_list.next == ent )
+                barrier();
+            spin_lock(&p->page_list_lock);
+            continue;
+        }
+
+        set_bit(_PGC_zombie, &page->count_and_flags);
+        /* free_domain_page() skips its own page-list unlink for zombie pages. */
+        list_del(&page->list);
+        p->tot_pages--;
+
+        list_add(&page->list, &zombies);
+    }
+    spin_unlock(&p->page_list_lock);
+
+    /* We do the potentially complex 'put' operations with no lock held. */
+    while ( (ent = zombies.next) != &zombies )
+    {
+        page = list_entry(ent, struct pfn_info, list);
  
-    p->tot_pages = 0;
+        list_del(&page->list);
+        /* Release the pin and allocation references, if those bits are still set. */
+        if ( test_and_clear_bit(_PGC_guest_pinned, &page->count_and_flags) )
+            put_page_and_type(page);
+
+        if ( test_and_clear_bit(_PGC_allocated, &page->count_and_flags) )
+            put_page(page);
+        /* Presumably pairs with the get_page() taken in the first loop. */
+        put_page(page);
+    }
  }
  
  
+/*
+ * Give a new domain @kbytes of memory by calling alloc_domain_page() per page.
+ * Returns 0 on success; on failure calls free_all_dom_mem() and returns -1.
+ */
+unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
+{
+    unsigned int done, nr_pages = kbytes >> (PAGE_SHIFT - 10);
+
+    /* TEMPORARY: max_pages should be explicitly specified. */
+    p->max_pages = nr_pages;
+
+    for ( done = 0; done < nr_pages; done++ )
+    {
+        if ( likely(alloc_domain_page(p) != NULL) &&
+             likely(free_pfns >= (SLACK_DOMAIN_MEM_KILOBYTES >>
+                                  (PAGE_SHIFT-10))) )
+            continue;
+        free_all_dom_mem(p);
+        return -1;
+    }
+
+    p->tot_pages = nr_pages;
+    return 0;
+}
+ 
+
  /* Release resources belonging to task @p. */
  void release_task(struct task_struct *p)
  {
@@ -309,7 +436,6 @@ void release_task(struct task_struct *p)
      destroy_event_channels(p);
      free_page((unsigned long)p->mm.perdomain_pt);
      UNSHARE_PFN(virt_to_page(p->shared_info));
-    free_page((unsigned long)p->shared_info);
      free_all_dom_mem(p);
  
      kmem_cache_free(task_struct_cachep, p);
@@ -360,11 +486,10 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain)
      p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs;
      p->failsafe_address  = builddomain->ctxt.failsafe_callback_eip;
      
-    /* NB. Page base must already be pinned! */
      phys_l2tab = builddomain->ctxt.pt_base;
      p->mm.pagetable = mk_pagetable(phys_l2tab);
-    get_page_type(&frame_table[phys_l2tab>>PAGE_SHIFT]);
-    get_page_tot(&frame_table[phys_l2tab>>PAGE_SHIFT]);
+    get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, 
+                      PGT_l2_page_table);
  
      /* Set up the shared info structure. */
      update_dom_time(p->shared_info);
@@ -449,7 +574,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
          return -ENOMEM;
      }
  
-    alloc_address = list_entry(p->pg_head.prev, struct pfn_info, list) -
+    alloc_address = list_entry(p->page_list.prev, struct pfn_info, list) -
          frame_table;
      alloc_address <<= PAGE_SHIFT;
      alloc_index = p->tot_pages;
@@ -497,7 +622,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
      p->mm.pagetable = mk_pagetable(phys_l2tab);
  
      l2tab += l2_table_offset(virt_load_address);
-    cur_address = list_entry(p->pg_head.next, struct pfn_info, list) -
+    cur_address = list_entry(p->page_list.next, struct pfn_info, list) -
          frame_table;
      cur_address <<= PAGE_SHIFT;
      for ( count = 0; count < p->tot_pages; count++ )
@@ -514,10 +639,10 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
          }
          *l1tab++ = mk_l1_pgentry(cur_address|L1_PROT);
          
-        page = frame_table + (cur_address >> PAGE_SHIFT);
-        page->flags = dom | PGT_writeable_page | PG_need_flush;
-        set_page_type_count(page, 1);
-        set_page_tot_count(page, 1);
+        page = &frame_table[cur_address >> PAGE_SHIFT];
+        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
+        if ( !get_page_and_type(page, p, PGT_writeable_page) )
+            BUG();
          /* Set up the MPT entry. */
          machine_to_phys_mapping[cur_address >> PAGE_SHIFT] = count;
  
@@ -538,8 +663,9 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
      {
          *l1tab = mk_l1_pgentry(l1_pgentry_val(*l1tab) & ~_PAGE_RW);
          page = frame_table + l1_pgentry_to_pagenr(*l1tab);
-        page->flags = dom | PGT_l1_page_table;
-        get_page_tot(page);
+        page->type_and_flags &= ~PGT_type_mask;
+        page->type_and_flags |= PGT_l1_page_table;
+        get_page(page, p); /* an extra ref because of readable mapping */
          l1tab++;
          if( !((unsigned long)l1tab & (PAGE_SIZE - 1)) )
          {
@@ -548,9 +674,13 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
              l2tab++;
          }
      }
-    get_page_type(page); /* guest_pinned */
-    get_page_tot(page);  /* guest_pinned */
-    page->flags = dom | PG_guest_pinned | PGT_l2_page_table;
+    /* Rewrite last L1 page to be a L2 page. */
+    page->type_and_flags &= ~PGT_type_mask;
+    page->type_and_flags |= PGT_l2_page_table;
+    /* Get another ref to L2 page so that it can be pinned. */
+    if ( !get_page_and_type(page, p, PGT_l2_page_table) )
+        BUG();
+    set_bit(_PGC_guest_pinned, &page->count_and_flags);
      unmap_domain_mem(l1start);
  
      /* Set up shared info area. */
@@ -565,7 +695,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
  
      /* Install the new page tables. */
      __cli();
-    __write_cr3_counted(pagetable_val(p->mm.pagetable));
+    write_cr3_counted(pagetable_val(p->mm.pagetable));
  
      /* Copy the guest OS image. */    
      src  = (char *)(phy_data_start + 12);
@@ -632,7 +762,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params,
  
  
      /* Reinstate the caller's page tables. */
-    __write_cr3_counted(pagetable_val(current->mm.pagetable));
+    write_cr3_counted(pagetable_val(current->mm.pagetable));
      __sti();
  
      p->flags |= PF_CONSTRUCTED;
diff --git a/xen/common/kernel.c b/xen/common/kernel.c

index 9f6fb745561a5f2fffdadcf837816ea058e55131..1bba43d7bec2477e281b08f911c3fa23ac427f1e 100644 (file)
--- a/xen/common/kernel.c
+++ b/xen/common/kernel.c
@@ -181,6 +181,13 @@ void cmain (unsigned long magic, multiboot_info_t *mbi)
          for ( ; ; ) ;
      }
  
+    /* The array of pfn_info structures must fit into the reserved area. */
+    if ( sizeof(struct pfn_info) > 24 )
+    {
+        printk("'struct pfn_info' too large to fit in Xen address space!\n");
+        for ( ; ; ) ;
+    }
+
      set_current(&idle0_task);
  
      max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10);
diff --git a/xen/common/memory.c b/xen/common/memory.c

index 8cbb503cf31e2f8b3ceeaeeedb1ebd210921d478..c2b4ee9f7ad0d6137fa996b28c7d16a487d21ea4 100644 (file)
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -139,34 +139,28 @@
  #include <asm/uaccess.h>
  #include <asm/domain_page.h>
  
-#if 0
-#define MEM_LOG(_f, _a...) 
+#ifndef NDEBUG
+#define MEM_LOG(_f, _a...)                           \
    printk("DOM%d: (file=memory.c, line=%d) " _f "\n", \
           current->domain, __LINE__, ## _a )
  #else
  #define MEM_LOG(_f, _a...) ((void)0)
  #endif
  
-/* Domain 0 is allowed to submit requests on behalf of others. */
-#define DOMAIN_OKAY(_f) \
-    ((((_f) & PG_domain_mask) == current->domain) || (current->domain == 0))
-
-/* 'get' checks parameter for validity before inc'ing refcnt. */
-static int get_l2_table(unsigned long page_nr);
-static int get_l1_table(unsigned long page_nr);
-static int get_page(unsigned long page_nr, int writeable);
-static int inc_page_refcnt(unsigned long page_nr, unsigned int type);
-/* 'put' does no checking because if refcnt not zero, entity must be valid. */
-static void put_l2_table(unsigned long page_nr);
-static void put_l1_table(unsigned long page_nr);
-static void put_page(unsigned long page_nr, int writeable);
-static int dec_page_refcnt(unsigned long page_nr, unsigned int type);
-
-static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t);
+static int alloc_l2_table(struct pfn_info *page);
+static int alloc_l1_table(struct pfn_info *page);
+static int get_page_from_pagenr(unsigned long page_nr);
+static int get_page_and_type_from_pagenr(unsigned long page_nr, 
+                                         unsigned int type);
+
+static void free_l2_table(struct pfn_info *page);
+static void free_l1_table(struct pfn_info *page);
+
+static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long);
  static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t);
  
  /* frame table size and its size in pages */
-frame_table_t * frame_table;
+struct pfn_info *frame_table;
  unsigned long frame_table_size;
  unsigned long max_page;
  
@@ -176,8 +170,11 @@ unsigned int free_pfns;
  
  /* Used to defer flushing of memory structures. */
  static struct {
-    int flush_tlb;
-    int refresh_ldt;
+#define DOP_FLUSH_TLB   (1<<0) /* Flush the TLB.                 */
+#define DOP_RELOAD_LDT  (1<<1) /* Reload the LDT shadow mapping. */
+#define DOP_RESTORE_CR0 (1<<2) /* Set the WP bit in CR0.         */
+    unsigned long flags;
+    unsigned long cr0;
  } deferred_op[NR_CPUS] __cacheline_aligned;
  
  /*
@@ -196,7 +193,7 @@ void __init init_frametable(unsigned long nr_pages)
      max_page = nr_pages;
      frame_table_size = nr_pages * sizeof(struct pfn_info);
      frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK;
-    frame_table = (frame_table_t *)FRAMETABLE_VIRT_START;
+    frame_table = (struct pfn_info *)FRAMETABLE_VIRT_START;
      memset(frame_table, 0, frame_table_size);
  
      free_pfns = 0;
@@ -218,7 +215,7 @@ void __init init_frametable(unsigned long nr_pages)
  
  static void __invalidate_shadow_ldt(struct task_struct *p)
  {
-    int i, cpu = p->processor;
+    int i;
      unsigned long pfn;
      struct pfn_info *page;
      
@@ -230,16 +227,13 @@ static void __invalidate_shadow_ldt(struct task_struct *p)
          if ( pfn == 0 ) continue;
          p->mm.perdomain_pt[i] = mk_l1_pgentry(0);
          page = frame_table + pfn;
-        ASSERT((page->flags & PG_type_mask) == PGT_ldt_page);
-        ASSERT((page->flags & PG_domain_mask) == p->domain);
-        ASSERT((page_type_count(page) != 0) && (page_tot_count(page) != 0));
-        put_page_type(page);
-        put_page_tot(page);                
+        ASSERT_PAGE_IS_TYPE(page, PGT_ldt_page);
+        ASSERT_PAGE_IS_DOMAIN(page, p);
+        put_page_and_type(page);
      }
  
      /* Dispose of the (now possibly invalid) mappings from the TLB.  */
-    deferred_op[cpu].flush_tlb   = 1;
-    deferred_op[cpu].refresh_ldt = 1;
+    deferred_op[p->processor].flags |= DOP_FLUSH_TLB | DOP_RELOAD_LDT;
  }
  
  
@@ -251,556 +245,614 @@ static inline void invalidate_shadow_ldt(void)
  }
  
  
+int alloc_segdesc_page(struct pfn_info *page)
+{
+    unsigned long *descs = map_domain_mem((page-frame_table) << PAGE_SHIFT);
+    int i;
+
+    for ( i = 0; i < 512; i++ )
+        if ( unlikely(!check_descriptor(descs[i*2], descs[i*2+1])) )
+            goto fail;
+
+    unmap_domain_mem(descs);
+    return 1;
+
+ fail:
+    unmap_domain_mem(descs);
+    return 0;
+}
+
+
  /* Map shadow page at offset @off. Returns 0 on success. */
  int map_ldt_shadow_page(unsigned int off)
  {
      struct task_struct *p = current;
-    unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
-    unsigned long l1e, *ldt_page;
-    struct pfn_info *page;
-    int i, ret = -1;
+    unsigned long l1e;
  
-    /* We cannot take a page_lock in interrupt context. */
-    if ( in_interrupt() )
+    if ( unlikely(in_interrupt()) )
          BUG();
  
-    spin_lock(&p->page_lock);
+    __get_user(l1e, (unsigned long *)&linear_pg_table[(p->mm.ldt_base >> 
+                                                       PAGE_SHIFT) + off]);
  
-    __get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
-    if ( unlikely(!(l1e & _PAGE_PRESENT)) )
-        goto out;
+    if ( unlikely(!(l1e & _PAGE_PRESENT)) ||
+         unlikely(!get_page_and_type(&frame_table[l1e >> PAGE_SHIFT], 
+                                     p, PGT_ldt_page)) )
+        return 0;
  
-    page = frame_table + (l1e >> PAGE_SHIFT);
-    if ( unlikely((page->flags & PG_type_mask) != PGT_ldt_page) )
-    {
-        if ( unlikely(page_type_count(page) != 0) )
-            goto out;
-
-        /* Check all potential LDT entries in the page. */
-        ldt_page = (unsigned long *)addr;
-        for ( i = 0; i < 512; i++ )
-            if ( unlikely(!check_descriptor(ldt_page[i*2], ldt_page[i*2+1])) )
-                goto out;
-        if ( unlikely(page->flags & PG_need_flush) )
-        {
-            perfc_incrc(need_flush_tlb_flush);
-            __write_cr3_counted(pagetable_val(p->mm.pagetable));
-            page->flags &= ~PG_need_flush;
-        }
+    p->mm.perdomain_pt[off + 16] = mk_l1_pgentry(l1e | _PAGE_RW);
+    p->mm.shadow_ldt_mapcnt++;
  
-        page->flags &= ~PG_type_mask;
-        page->flags |= PGT_ldt_page;
-    }
+    return 1;
+}
  
-    /* Success! */
-    get_page_type(page);
-    get_page_tot(page);
-    p->mm.perdomain_pt[off+16] = mk_l1_pgentry(l1e|_PAGE_RW);
-    p->mm.shadow_ldt_mapcnt++;
  
-    ret = 0;
+/* Domain 0 is allowed to build page tables on others' behalf. */
+static inline int dom0_get_page(struct pfn_info *page)
+{
+    unsigned long x, nx, y = page->count_and_flags;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        if ( unlikely((x & PGC_count_mask) == 0) ||
+             unlikely((nx & PGC_count_mask) == 0) )
+            return 0;
+    }
+    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
  
- out:
-    spin_unlock(&p->page_lock);
-    return ret;
+    return 1;
  }
  
  
-/* Return original refcnt, or -1 on error. */
-static int inc_page_refcnt(unsigned long page_nr, unsigned int type)
+static int get_page_from_pagenr(unsigned long page_nr)
  {
-    struct pfn_info *page;
-    unsigned long flags;
+    struct pfn_info *page = &frame_table[page_nr];
  
      if ( unlikely(page_nr >= max_page) )
      {
          MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
-        return -1;
+        return 0;
      }
-    page = frame_table + page_nr;
-    flags = page->flags;
-    if ( unlikely(!DOMAIN_OKAY(flags)) )
+
+    if ( unlikely(!get_page(page, current)) &&
+         ((current->domain != 0) || !dom0_get_page(page)) )
      {
-        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
-        return -1;
+        MEM_LOG("Could not get page reference for pfn %08lx\n", page_nr);
+        return 0;
      }
-    if ( (flags & PG_type_mask) != type )
+
+    return 1;
+}
+
+
+static int get_page_and_type_from_pagenr(unsigned long page_nr, 
+                                         unsigned int type)
+{
+    struct pfn_info *page = &frame_table[page_nr];
+
+    if ( unlikely(!get_page_from_pagenr(page_nr)) )
+        return 0;
+
+    if ( unlikely(!get_page_type(page, type)) )
      {
-        if ( page_type_count(page) != 0 )
-        {
-            MEM_LOG("Page %08lx bad type/count (%08lx!=%08x) cnt=%ld",
-                    page_nr << PAGE_SHIFT,
-                    flags & PG_type_mask, type, page_type_count(page));
-            return -1;
-        }
+        MEM_LOG("Bad page type for pfn %08lx (%08lx)", 
+                page_nr, page->type_and_flags);
+        put_page(page);
+        return 0;
+    }
  
-        if ( unlikely(flags & PG_need_flush) )
-        {
-            deferred_op[smp_processor_id()].flush_tlb = 1;
-            page->flags &= ~PG_need_flush;
-            perfc_incrc(need_flush_tlb_flush);
-        }
+    return 1;
+}
+
+
+/*
+ * We allow an L2 table to map itself, to achieve a linear p.t. Note that this
+ * does not raise any reference counts.
+ */
+static int check_linear_pagetable(l2_pgentry_t l2e, unsigned long pfn)
+{
+    if ( (l2_pgentry_val(l2e) & _PAGE_RW) )
+    {
+        MEM_LOG("Attempt to create linear p.t. with write perms");
+        return 0;
+    }
  
-        page->flags &= ~PG_type_mask;
-        page->flags |= type;
+    if ( (l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn )
+    {
+        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
+        return 0;
      }
  
-    get_page_tot(page);
-    return get_page_type(page);
+    return 1;
  }
  
  
-/* Return new refcnt, or -1 on error. */
-static int dec_page_refcnt(unsigned long page_nr, unsigned int type)
+static int get_page_from_l1e(l1_pgentry_t l1e)
  {
-    struct pfn_info *page;
+    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
  
-    if ( unlikely(page_nr >= max_page) )
+    if ( unlikely((l1_pgentry_val(l1e) &
+                   (_PAGE_GLOBAL|_PAGE_PAT))) )
      {
-        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
-        return -1;
+        MEM_LOG("Bad L1 page type settings %04lx",
+                l1_pgentry_val(l1e) &
+                (_PAGE_GLOBAL|_PAGE_PAT));
+        return 0;
      }
-    page = frame_table + page_nr;
-    if ( unlikely(!DOMAIN_OKAY(page->flags)) || 
-         unlikely(((page->flags & PG_type_mask) != type)) ) 
+
+    if ( l1_pgentry_val(l1e) & _PAGE_RW )
      {
-        MEM_LOG("Bad page type/domain (dom=%ld) (type %ld != expected %d)",
-                page->flags & PG_domain_mask, page->flags & PG_type_mask,
-                type);
-        return -1;
+        if ( unlikely(!get_page_and_type_from_pagenr(
+            l1_pgentry_to_pagenr(l1e), PGT_writeable_page)) )
+            return 0;
+        set_bit(_PGC_tlb_flush_on_type_change, 
+                &frame_table[l1_pgentry_to_pagenr(l1e)].count_and_flags);
      }
-    ASSERT(page_type_count(page) != 0);
-    put_page_tot(page);
-    return put_page_type(page);
+    else
+    {
+        if ( unlikely(!get_page_from_pagenr(l1_pgentry_to_pagenr(l1e))) )
+            return 0;
+    }
+
+    return 1;
  }
  
  
-/* We allow a L2 table to map itself, to achieve a linear pagetable. */
-/* NB. There's no need for a put_twisted_l2_table() function!! */
-static int get_twisted_l2_table(unsigned long entry_pfn, l2_pgentry_t l2e)
+/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
+static int get_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  {
-    unsigned long l2v = l2_pgentry_val(l2e);
+    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
  
-    /* Clearly the mapping must be read-only :-) */
-    if ( (l2v & _PAGE_RW) )
+    if ( unlikely((l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE))) )
      {
-        MEM_LOG("Attempt to install twisted L2 entry with write permissions");
-        return -1;
+        MEM_LOG("Bad L2 page type settings %04lx",
+                l2_pgentry_val(l2e) & (_PAGE_GLOBAL|_PAGE_PSE));
+        return 0;
      }
  
-    /* This is a sufficient final check. */
-    if ( (l2v >> PAGE_SHIFT) != entry_pfn )
+    if ( unlikely(!get_page_and_type_from_pagenr(
+        l2_pgentry_to_pagenr(l2e), PGT_l1_page_table)) &&
+         unlikely(!check_linear_pagetable(l2e, pfn)) )
+        return 0;
+
+    return 1;
+}
+
+
+static void put_page_from_l1e(l1_pgentry_t l1e)
+{
+    struct pfn_info *page;
+
+    ASSERT(l1_pgentry_val(l1e) & _PAGE_PRESENT);
+
+    page = &frame_table[l1_pgentry_to_pagenr(l1e)];
+
+    if ( l1_pgentry_val(l1e) & _PAGE_RW )
      {
-        MEM_LOG("L2 tables may not map _other_ L2 tables!\n");
-        return -1;
+        put_page_and_type(page);
+    }
+    else
+    {
+        /* We expect this is rare so we blow the entire shadow LDT. */
+        if ( unlikely(((page->type_and_flags & PGT_type_mask) == 
+                       PGT_ldt_page)) &&
+             unlikely(((page->type_and_flags & PGT_count_mask) != 0)) )
+            invalidate_shadow_ldt();
+        put_page(page);
      }
-    
-    /* We don't bump the reference counts. */
-    return 0;
  }
  
  
-static int get_l2_table(unsigned long page_nr)
+/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
+static void put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
  {
-    struct pfn_info *page;
-    struct task_struct *p;
-    l2_pgentry_t *p_l2_entry, l2_entry;
-    int i, ret=0;
+    ASSERT(l2_pgentry_val(l2e) & _PAGE_PRESENT);
+
+    if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) && 
+         ((l2_pgentry_val(l2e) >> PAGE_SHIFT) != pfn) )
+        put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
+}
+
+
+static int alloc_l2_table(struct pfn_info *page)
+{
+    unsigned long page_nr = page - frame_table;
+    l2_pgentry_t *pl2e, l2e;
+    int i;
     
-    ret = inc_page_refcnt(page_nr, PGT_l2_page_table);
-    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
-    
-    /* NEW level-2 page table! Deal with every PDE in the table. */
-    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
+    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
+
      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
      {
-        l2_entry = *p_l2_entry++;
-        if ( !(l2_pgentry_val(l2_entry) & _PAGE_PRESENT) ) continue;
-        if ( unlikely((l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE))) )
-        {
-            MEM_LOG("Bad L2 page type settings %04lx",
-                    l2_pgentry_val(l2_entry) & (_PAGE_GLOBAL|_PAGE_PSE));
-            ret = -1;
+        l2e = pl2e[i];
+
+        if ( !(l2_pgentry_val(l2e) & _PAGE_PRESENT) ) 
+            continue;
+
+        if ( unlikely(!get_page_from_l2e(l2e, page_nr)) )
              goto fail;
-        }
-        /* Assume we're mapping an L1 table, falling back to twisted L2. */
-        ret = get_l1_table(l2_pgentry_to_pagenr(l2_entry));
-        if ( unlikely(ret) ) ret = get_twisted_l2_table(page_nr, l2_entry);
-        if ( unlikely(ret) ) goto fail;
      }
      
-    /* Now we simply slap in our high mapping. */
-    memcpy(p_l2_entry, 
+    /* Now we add our private high mappings. */
+    memcpy(&pl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 
             &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
             HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
-    p_l2_entry[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) -
-              DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
+    pl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
          mk_l2_pgentry((page_nr << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+    pl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+        mk_l2_pgentry(__pa(page->u.domain->mm.perdomain_pt) | 
+                      __PAGE_HYPERVISOR);
  
-    /*
-     * The per-domain PGD is slightly tricky, as we may not be executing
-     * in the context of the correct domain (DOM0 builds pt's for others).
-     */
-    page = frame_table + page_nr;
-    if ( (p = find_domain_by_id(page->flags & PG_domain_mask)) != NULL )
-    {
-        p_l2_entry[(PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT) -
-                  DOMAIN_ENTRIES_PER_L2_PAGETABLE] =
-            mk_l2_pgentry(__pa(p->mm.perdomain_pt) | __PAGE_HYPERVISOR);
-        put_task_struct(p);
-    }
-
- out:
-    unmap_domain_mem(p_l2_entry);
-    return ret;
+    unmap_domain_mem(pl2e);
+    return 1;
  
   fail:
-    p_l2_entry--;
      while ( i-- > 0 )
      {
-        l2_entry = *--p_l2_entry;
-        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
-            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
+        l2e = pl2e[i];
+        if ( l2_pgentry_val(l2e) & _PAGE_PRESENT )
+            put_page_from_l2e(l2e, page_nr);
      }
-    if ( dec_page_refcnt(page_nr, PGT_l2_page_table) != 0 )
-        BUG();
-    goto out;
+
+    unmap_domain_mem(pl2e);
+    return 0;
  }
  
  
-static int get_l1_table(unsigned long page_nr)
+static int alloc_l1_table(struct pfn_info *page)
  {
-    l1_pgentry_t *p_l1_entry, l1_entry;
-    int i, ret;
+    unsigned long page_nr = page - frame_table;
+    l1_pgentry_t *pl1e, l1e;
+    int i;
  
-    /* Update ref count for page pointed at by PDE. */
-    ret = inc_page_refcnt(page_nr, PGT_l1_page_table);
-    if ( likely(ret != 0) ) return (ret < 0) ? ret : 0;
+    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  
-    /* NEW level-1 page table! Deal with every PTE in the table. */
-    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
      for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
      {
-        l1_entry = *p_l1_entry++;
-        if ( !(l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) continue;
-        if ( unlikely((l1_pgentry_val(l1_entry) &
-                       (_PAGE_GLOBAL|_PAGE_PAT))) )
-        {
-            MEM_LOG("Bad L1 page type settings %04lx",
-                    l1_pgentry_val(l1_entry) &
-                    (_PAGE_GLOBAL|_PAGE_PAT));
-            ret = -1;
+        l1e = pl1e[i];
+
+        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
+            continue;
+
+        if ( unlikely(!get_page_from_l1e(l1e)) )
              goto fail;
-        }
-        ret = get_page(l1_pgentry_to_pagenr(l1_entry),
-                       l1_pgentry_val(l1_entry) & _PAGE_RW);
-        if ( unlikely(ret) ) goto fail;
      }
  
      /* Make sure we unmap the right page! */
-    unmap_domain_mem(p_l1_entry-1);
-    return ret;
+    unmap_domain_mem(pl1e);
+    return 1;
  
   fail:
-    p_l1_entry--;
      while ( i-- > 0 )
      {
-        l1_entry = *--p_l1_entry;
-        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
-            put_page(l1_pgentry_to_pagenr(l1_entry), 
-                     l1_pgentry_val(l1_entry) & _PAGE_RW);
-    }
-    if ( dec_page_refcnt(page_nr, PGT_l1_page_table) != 0 )
-        BUG();
-    unmap_domain_mem(p_l1_entry);
-    return ret;
-}
-
-
-static int get_page(unsigned long page_nr, int writeable)
-{
-    struct pfn_info *page;
-    unsigned long flags;
-
-    /* Update ref count for page pointed at by PTE. */
-    if ( unlikely(page_nr >= max_page) )
-    {
-        MEM_LOG("Page out of range (%08lx>%08lx)", page_nr, max_page);
-        return(-1);
-    }
-    page = frame_table + page_nr;
-    flags = page->flags;
-    if ( unlikely(!DOMAIN_OKAY(flags)) )
-    {
-        MEM_LOG("Bad page domain (%ld)", flags & PG_domain_mask);
-        return(-1);
+        l1e = pl1e[i];
+        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) )
+            continue;
+        put_page_from_l1e(l1e);
      }
  
-    if ( writeable )
-    {
-        if ( (flags & PG_type_mask) != PGT_writeable_page )
-        {
-            if ( page_type_count(page) != 0 )
-            {
-                MEM_LOG("Bad page type/count (%08lx!=%08x) cnt=%ld",
-                        flags & PG_type_mask, PGT_writeable_page,
-                        page_type_count(page));
-                return(-1);
-            }
-            page->flags &= ~PG_type_mask;
-            page->flags |= PGT_writeable_page;
-        }
-        page->flags |= PG_need_flush;
-        get_page_type(page);
-    }
-
-    get_page_tot(page);
-    
-    return(0);
+    unmap_domain_mem(pl1e);
+    return 0;
  }
  
  
-static void put_l2_table(unsigned long page_nr)
+static void free_l2_table(struct pfn_info *page)
  {
-    l2_pgentry_t *p_l2_entry, l2_entry;
+    unsigned long page_nr = page - frame_table;
+    l2_pgentry_t *pl2e, l2e;
      int i;
  
-    if ( likely(dec_page_refcnt(page_nr, PGT_l2_page_table)) ) return;
+    pl2e = map_domain_mem(page_nr << PAGE_SHIFT);
  
-    /* We had last reference to level-2 page table. Free the PDEs. */
-    p_l2_entry = map_domain_mem(page_nr << PAGE_SHIFT);
      for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
      {
-        l2_entry = *p_l2_entry++;
-        if ( (l2_pgentry_val(l2_entry) & _PAGE_PRESENT) )
-            put_l1_table(l2_pgentry_to_pagenr(l2_entry));
+        l2e = pl2e[i];
+        if ( (l2_pgentry_val(l2e) & _PAGE_PRESENT) &&
+             unlikely((l2_pgentry_val(l2e) >> PAGE_SHIFT) != page_nr) )
+            put_page_and_type(&frame_table[l2_pgentry_to_pagenr(l2e)]);
      }
  
-    unmap_domain_mem(p_l2_entry);
+    unmap_domain_mem(pl2e);
  }
  
  
-static void put_l1_table(unsigned long page_nr)
+static void free_l1_table(struct pfn_info *page)
  {
-    l1_pgentry_t *p_l1_entry, l1_entry;
+    unsigned long page_nr = page - frame_table;
+    l1_pgentry_t *pl1e, l1e;
      int i;
  
-    if ( likely(dec_page_refcnt(page_nr, PGT_l1_page_table)) ) return;
+    pl1e = map_domain_mem(page_nr << PAGE_SHIFT);
  
-    /* We had last reference to level-1 page table. Free the PTEs. */
-    p_l1_entry = map_domain_mem(page_nr << PAGE_SHIFT);
      for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
      {
-        l1_entry = *p_l1_entry++;
-        if ( (l1_pgentry_val(l1_entry) & _PAGE_PRESENT) ) 
-            put_page(l1_pgentry_to_pagenr(l1_entry), 
-                     l1_pgentry_val(l1_entry) & _PAGE_RW);
+        l1e = pl1e[i];
+        if ( !(l1_pgentry_val(l1e) & _PAGE_PRESENT) ) 
+            continue;
+        put_page_from_l1e(l1e);
      }
  
-    /* Make sure we unmap the right page! */
-    unmap_domain_mem(p_l1_entry-1);
+    unmap_domain_mem(pl1e);
  }
  
  
-static void put_page(unsigned long page_nr, int writeable)
+static inline int update_l2e(l2_pgentry_t *pl2e, 
+                             l2_pgentry_t  ol2e, 
+                             l2_pgentry_t  nl2e)
  {
-    struct pfn_info *page;
-    ASSERT(page_nr < max_page);
-    page = frame_table + page_nr;
-    ASSERT(DOMAIN_OKAY(page->flags));
-    ASSERT((!writeable) || 
-           ((page_type_count(page) != 0) && 
-            ((page->flags & PG_type_mask) == PGT_writeable_page) &&
-            ((page->flags & PG_need_flush) == PG_need_flush)));
-    if ( writeable )
-    {
-        put_page_type(page);
-    }
-    else if ( unlikely(((page->flags & PG_type_mask) == PGT_ldt_page) &&
-                       (page_type_count(page) != 0)) )
-    {
-        /* We expect this is rare so we just blow the entire shadow LDT. */
-        invalidate_shadow_ldt();
-    }
-    put_page_tot(page);
+    unsigned long o = cmpxchg((unsigned long *)pl2e, 
+                              l2_pgentry_val(ol2e), 
+                              l2_pgentry_val(nl2e));
+    if ( o != l2_pgentry_val(ol2e) )
+        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+                l2_pgentry_val(ol2e), l2_pgentry_val(nl2e), o);
+    return (o == l2_pgentry_val(ol2e));
  }
  
  
-static int mod_l2_entry(l2_pgentry_t *p_l2_entry, l2_pgentry_t new_l2_entry)
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e, 
+                        l2_pgentry_t nl2e, 
+                        unsigned long pfn)
  {
-    l2_pgentry_t old_l2_entry = *p_l2_entry;
+    l2_pgentry_t ol2e;
+    unsigned long _ol2e;
  
-    if ( unlikely((((unsigned long)p_l2_entry & (PAGE_SIZE-1)) >> 2) >=
+    if ( unlikely((((unsigned long)pl2e & (PAGE_SIZE-1)) >> 2) >=
                    DOMAIN_ENTRIES_PER_L2_PAGETABLE) )
      {
-        MEM_LOG("Illegal L2 update attempt in hypervisor area %p",
-                p_l2_entry);
-        goto fail;
+        MEM_LOG("Illegal L2 update attempt in hypervisor area %p", pl2e);
+        return 0;
      }
  
-    if ( (l2_pgentry_val(new_l2_entry) & _PAGE_PRESENT) )
+    if ( unlikely(__get_user(_ol2e, (unsigned long *)pl2e) != 0) )
+        return 0;
+    ol2e = mk_l2_pgentry(_ol2e);
+
+    if ( l2_pgentry_val(nl2e) & _PAGE_PRESENT )
      {
-        if ( unlikely((l2_pgentry_val(new_l2_entry) & 
-                       (_PAGE_GLOBAL|_PAGE_PSE))) )
-        {
-            MEM_LOG("Bad L2 entry val %04lx",
-                    l2_pgentry_val(new_l2_entry) & 
-                    (_PAGE_GLOBAL|_PAGE_PSE));
-            goto fail;
-        }
          /* Differ in mapping (bits 12-31) or presence (bit 0)? */
-        if ( ((l2_pgentry_val(old_l2_entry) ^ 
-               l2_pgentry_val(new_l2_entry)) & 0xfffff001) != 0 )
+        if ( ((l2_pgentry_val(ol2e) ^ l2_pgentry_val(nl2e)) & ~0xffe) != 0 )
          {
-            /* Assume we're mapping an L1 table, falling back to twisted L2. */
-            if ( unlikely(get_l1_table(l2_pgentry_to_pagenr(new_l2_entry))) )
+            if ( unlikely(!get_page_from_l2e(nl2e, pfn)) )
+                return 0;
+
+            if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
              {
-                /* NB. No need to sanity-check the VA: done already. */
-                unsigned long l1e = l1_pgentry_val(
-                    linear_pg_table[(unsigned long)p_l2_entry >> PAGE_SHIFT]);
-                if ( get_twisted_l2_table(l1e >> PAGE_SHIFT, new_l2_entry) )
-                    goto fail;
+                put_page_from_l2e(nl2e, pfn);
+                return 0;
              }
  
-            if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) ) 
-                put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));            
-        } 
+            if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
+                put_page_from_l2e(ol2e, pfn);
+        }
+        else if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+        {
+            return 0;
+        }
      }
-    else if ( (l2_pgentry_val(old_l2_entry) & _PAGE_PRESENT) )
+    else
      {
-        put_l1_table(l2_pgentry_to_pagenr(old_l2_entry));
+        if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
+            return 0;
+
+        if ( l2_pgentry_val(ol2e) & _PAGE_PRESENT )
+            put_page_from_l2e(ol2e, pfn);
      }
      
-    *p_l2_entry = new_l2_entry;
-    return 0;
-
- fail:
-    return -1;
+    return 1;
  }
  
  
-static int mod_l1_entry(l1_pgentry_t *p_l1_entry, l1_pgentry_t new_l1_entry)
+static inline int update_l1e(l1_pgentry_t *pl1e, 
+                             l1_pgentry_t  ol1e, 
+                             l1_pgentry_t  nl1e)
  {
-    l1_pgentry_t old_l1_entry = *p_l1_entry;
+    unsigned long o = l1_pgentry_val(ol1e);
+    unsigned long n = l1_pgentry_val(nl1e);
  
-    if ( (l1_pgentry_val(new_l1_entry) & _PAGE_PRESENT) )
+    while ( unlikely(cmpxchg_user(pl1e, o, n) != 0) )
      {
-        if ( unlikely((l1_pgentry_val(new_l1_entry) &
-                       (_PAGE_GLOBAL|_PAGE_PAT))) ) 
+        unsigned int cpu = smp_processor_id();
+        /* The CMPXCHG faulted -- maybe we need to clear the WP bit. */
+        if ( deferred_op[cpu].flags & DOP_RESTORE_CR0 )
          {
-            MEM_LOG("Bad L1 entry val %04lx",
-                    l1_pgentry_val(new_l1_entry) & 
-                    (_PAGE_GLOBAL|_PAGE_PAT));
-            goto fail;
+            MEM_LOG("cmpxchg fault despite WP bit cleared\n");
+            return 0;
          }
+        deferred_op[cpu].cr0 = read_cr0();
+        write_cr0(deferred_op[cpu].cr0 & ~X86_CR0_WP);
+        deferred_op[cpu].flags |= DOP_RESTORE_CR0;
+    }
+
+    if ( o != l1_pgentry_val(ol1e))
+        MEM_LOG("Failed to update %08lx -> %08lx: saw %08lx\n",
+                l1_pgentry_val(ol1e), l1_pgentry_val(nl1e), o);
+
+    /* The swap was successful if the old value we saw is equal to ol1e. */
+    return (o == l1_pgentry_val(ol1e));
+}
+
+
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e)
+{
+    l1_pgentry_t ol1e;
+    unsigned long _ol1e;
+
+    if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) )
+    {
+        MEM_LOG("Bad get_user\n");
+        return 0;
+    }
+    
+    ol1e = mk_l1_pgentry(_ol1e);
+
+    if ( l1_pgentry_val(nl1e) & _PAGE_PRESENT )
+    {
          /*
           * Differ in mapping (bits 12-31), writeable (bit 1), or
           * presence (bit 0)?
           */
-        if ( ((l1_pgentry_val(old_l1_entry) ^
-               l1_pgentry_val(new_l1_entry)) & 0xfffff003) != 0 )
+        if ( ((l1_pgentry_val(ol1e) ^ l1_pgentry_val(nl1e)) & ~0xffc) != 0 )
          {
-            if ( get_page(l1_pgentry_to_pagenr(new_l1_entry),
-                          l1_pgentry_val(new_l1_entry) & _PAGE_RW) )
-                goto fail;
-
-            if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) ) 
-                put_page(l1_pgentry_to_pagenr(old_l1_entry),
-                         l1_pgentry_val(old_l1_entry) & _PAGE_RW);
-        } 
+            if ( unlikely(!get_page_from_l1e(nl1e)) )
+                return 0;
+
+            if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+            {
+                put_page_from_l1e(nl1e);
+                return 0;
+            }
+
+            if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
+                put_page_from_l1e(ol1e);
+        }
+        else if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+        {
+            return 0;
+        }
+    }
+    else 
+    {
+        if ( unlikely(!update_l1e(pl1e, ol1e, nl1e)) )
+            return 0;
+
+        if ( l1_pgentry_val(ol1e) & _PAGE_PRESENT )
+            put_page_from_l1e(ol1e);
+    }
+
+    return 1;
+}
+
+
+int alloc_page_type(struct pfn_info *page, unsigned int type)
+{
+    if ( unlikely(test_and_clear_bit(_PGC_tlb_flush_on_type_change, 
+                                     &page->count_and_flags)) )
+    {
+        struct task_struct *p = page->u.domain;
+        mb(); /* Check zombie status before using domain ptr. */
+        /*
+         * NB. 'p' may no longer be valid by time we dereference it, so
+         * p->processor might be garbage. We clamp it, just in case.
+         */
+        if ( !test_bit(_PGC_zombie, &page->count_and_flags) &&
+             unlikely(NEED_FLUSH(tlbflush_time[(p->processor)&(NR_CPUS-1)], 
+                                 page->tlbflush_timestamp)) )
+        {
+            perfc_incr(need_flush_tlb_flush);
+            flush_tlb_cpu(p->processor);
+        }
      }
-    else if ( (l1_pgentry_val(old_l1_entry) & _PAGE_PRESENT) )
+
+    switch ( type )
      {
-        put_page(l1_pgentry_to_pagenr(old_l1_entry),
-                 l1_pgentry_val(old_l1_entry) & _PAGE_RW);
+    case PGT_l1_page_table:
+        return alloc_l1_table(page);
+    case PGT_l2_page_table:
+        return alloc_l2_table(page);
+    case PGT_gdt_page:
+    case PGT_ldt_page:
+        return alloc_segdesc_page(page);
+    default:
+        BUG();
      }
  
-    *p_l1_entry = new_l1_entry;
      return 0;
+}
  
- fail:
-    return -1;
+
+void free_page_type(struct pfn_info *page, unsigned int type)
+{
+    switch ( type )
+    {
+    case PGT_l1_page_table:
+        return free_l1_table(page);
+    case PGT_l2_page_table:
+        return free_l2_table(page);
+    default:
+        BUG();
+    }
  }
  
  
  static int do_extended_command(unsigned long ptr, unsigned long val)
  {
-    int err = 0, cpu = smp_processor_id();
+    int okay = 1, cpu = smp_processor_id();
      unsigned int cmd = val & MMUEXT_CMD_MASK;
      unsigned long pfn = ptr >> PAGE_SHIFT;
-    struct pfn_info *page = frame_table + pfn;
+    struct pfn_info *page = &frame_table[pfn];
  
      /* 'ptr' must be in range except where it isn't a machine address. */
      if ( (pfn >= max_page) && (cmd != MMUEXT_SET_LDT) )
+    {
+        MEM_LOG("Ptr out of range for extended MMU command");
          return 1;
+    }
  
      switch ( cmd )
      {
      case MMUEXT_PIN_L1_TABLE:
-        if ( unlikely(page->flags & PG_guest_pinned) )
+    case MMUEXT_PIN_L2_TABLE:
+        okay = get_page_and_type_from_pagenr(pfn, 
+                                             (cmd == MMUEXT_PIN_L2_TABLE) ? 
+                                             PGT_l2_page_table : 
+                                             PGT_l1_page_table);
+        if ( unlikely(!okay) )
          {
-            MEM_LOG("Pfn %08lx already pinned", pfn);
-            err = 1;
+            MEM_LOG("Error while pinning pfn %08lx", pfn);
              break;
          }
-        err = get_l1_table(pfn);
-        goto mark_as_pinned;
  
-    case MMUEXT_PIN_L2_TABLE:
-        if ( unlikely(page->flags & PG_guest_pinned) )
+        if ( unlikely(test_and_set_bit(_PGC_guest_pinned, 
+                                       &page->count_and_flags)) )
          {
              MEM_LOG("Pfn %08lx already pinned", pfn);
-            err = 1;
+            put_page_and_type(page);
+            okay = 0;
              break;
          }
-        err = get_l2_table(pfn);
  
-    mark_as_pinned:
-        if ( unlikely(err) )
-        {
-            MEM_LOG("Error while pinning pfn %08lx", pfn);
-            break;
-        }
-        page->flags |= PG_guest_pinned;
          break;
  
      case MMUEXT_UNPIN_TABLE:
-        if ( unlikely(!DOMAIN_OKAY(page->flags)) )
+        if ( unlikely(!(okay = get_page_from_pagenr(pfn))) )
          {
-            err = 1;
-            MEM_LOG("Page %08lx bad domain (dom=%ld)",
-                    ptr, page->flags & PG_domain_mask);
+            MEM_LOG("Page %08lx bad domain (dom=%p)",
+                    ptr, page->u.domain);
          }
-        else if ( likely(page->flags & PG_guest_pinned) )
+        else if ( likely(test_and_clear_bit(_PGC_guest_pinned, 
+                                            &page->count_and_flags)) )
          {
-            page->flags &= ~PG_guest_pinned;
-            ((page->flags & PG_type_mask) == PGT_l1_page_table) ?
-                put_l1_table(pfn) : put_l2_table(pfn);
+            put_page_and_type(page);
          }
          else
          {
-            err = 1;
+            okay = 0;
              MEM_LOG("Pfn %08lx not pinned", pfn);
          }
          break;
  
      case MMUEXT_NEW_BASEPTR:
-        err = get_l2_table(pfn);
-        if ( !err )
+        okay = get_page_and_type_from_pagenr(pfn, PGT_l2_page_table);
+        if ( likely(okay) )
          {
-            put_l2_table(pagetable_val(current->mm.pagetable) >> PAGE_SHIFT);
+            put_page_and_type(&frame_table[pagetable_val(current->mm.pagetable)
+                                          >> PAGE_SHIFT]);
              current->mm.pagetable = mk_pagetable(pfn << PAGE_SHIFT);
              invalidate_shadow_ldt();
-            deferred_op[cpu].flush_tlb = 1;
+            deferred_op[cpu].flags |= DOP_FLUSH_TLB;
          }
          else
          {
-            MEM_LOG("Error while installing new baseptr %08lx %d", ptr, err);
+            MEM_LOG("Error while installing new baseptr %08lx", ptr);
          }
          break;
          
      case MMUEXT_TLB_FLUSH:
-        deferred_op[cpu].flush_tlb = 1;
+        deferred_op[cpu].flags |= DOP_FLUSH_TLB;
          break;
      
      case MMUEXT_INVLPG:
@@ -815,7 +867,7 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
               ((ptr+ents*LDT_ENTRY_SIZE) < ptr) ||
               ((ptr+ents*LDT_ENTRY_SIZE) > PAGE_OFFSET) )
          {
-            err = 1;
+            okay = 0;
              MEM_LOG("Bad args to SET_LDT: ptr=%08lx, ents=%08lx", ptr, ents);
          }
          else if ( (current->mm.ldt_ents != ents) || 
@@ -825,37 +877,39 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
              current->mm.ldt_base = ptr;
              current->mm.ldt_ents = ents;
              load_LDT(current);
-            deferred_op[cpu].refresh_ldt = (ents != 0);
+            deferred_op[cpu].flags &= ~DOP_RELOAD_LDT;
+            if ( ents != 0 )
+                deferred_op[cpu].flags |= DOP_RELOAD_LDT;
          }
          break;
      }
  
      default:
          MEM_LOG("Invalid extended pt command 0x%08lx", val & MMUEXT_CMD_MASK);
-        err = 1;
+        okay = 0;
          break;
      }
  
-    return err;
+    return okay;
  }
  
  
  int do_mmu_update(mmu_update_t *ureqs, int count)
  {
      mmu_update_t req;
-    unsigned long flags, pfn, l1e;
+    unsigned long va = 0, flags, pfn, prev_pfn = 0;
      struct pfn_info *page;
-    int rc = 0, err = 0, i, cpu = smp_processor_id();
+    int rc = 0, okay = 1, i, cpu = smp_processor_id();
      unsigned int cmd;
-    unsigned long cr0 = 0;
  
-    perfc_incrc( calls_to_mmu_update ); 
-    perfc_addc( num_page_updates, count );
+    perfc_incrc(calls_to_mmu_update); 
+    perfc_addc(num_page_updates, count);
  
      for ( i = 0; i < count; i++ )
      {
          if ( unlikely(copy_from_user(&req, ureqs, sizeof(req)) != 0) )
          {
+            MEM_LOG("Bad copy_from_user");
              rc = -EFAULT;
              break;
          }
@@ -863,77 +917,85 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
          cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
          pfn = req.ptr >> PAGE_SHIFT;
  
-        err = 1;
-
-        spin_lock(&current->page_lock);
-
-        /* Get the page-frame number that a non-extended command references. */
-        if ( (cmd == MMU_NORMAL_PT_UPDATE) || 
-             (cmd == MMU_UNCHECKED_PT_UPDATE) )
-        {
-            if ( cr0 == 0 )
-            {
-                cr0 = read_cr0();
-                write_cr0(cr0 & ~X86_CR0_WP);
-            }
-            /* Need to use 'get_user' since the VA's PGD may be absent. */
-            __get_user(l1e, (unsigned long *)(linear_pg_table+pfn));
-            /* Now check that the VA's PTE isn't absent. */
-            if ( unlikely(!(l1e & _PAGE_PRESENT)) )
-            {
-                MEM_LOG("L1E n.p. at VA %08lx (%08lx)", req.ptr&~3, l1e);
-                goto unlock;
-            }
-            /* Finally, get the underlying machine address. */
-            pfn = l1e >> PAGE_SHIFT;
-        }
+        okay = 0;
  
-        /* Least significant bits of 'ptr' demux the operation type. */
          switch ( cmd )
          {
              /*
               * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
               */
          case MMU_NORMAL_PT_UPDATE:
-            page  = frame_table + pfn;
-            flags = page->flags;
+            page = &frame_table[pfn];
  
-            if ( likely(DOMAIN_OKAY(flags)) )
+            if ( unlikely(!get_page(page, current)) &&
+                 ((current->domain != 0) || !dom0_get_page(page)) )
              {
-                switch ( (flags & PG_type_mask) )
-                {
-                case PGT_l1_page_table: 
-                    err = mod_l1_entry((l1_pgentry_t *)req.ptr, 
-                                       mk_l1_pgentry(req.val)); 
-                    break;
-                case PGT_l2_page_table: 
-                    err = mod_l2_entry((l2_pgentry_t *)req.ptr, 
-                                       mk_l2_pgentry(req.val)); 
-                    break;                    
-                default:
-                    if ( page_type_count(page) == 0 )
-                    {
-                        *(unsigned long *)req.ptr = req.val;
-                        err = 0;
-                    }
-                    else
-                        MEM_LOG("Update to bad page %08lx", req.ptr);
-                    break;
-                }
+                MEM_LOG("Could not get page for normal update");
+                break;
+            }
+
+            if ( likely(prev_pfn == pfn) )
+            {
+                va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
              }
              else
              {
-                MEM_LOG("Bad domain normal update (dom %d, pfn %ld)",
-                        current->domain, pfn);
+                if ( prev_pfn != 0 )
+                    unmap_domain_mem((void *)va);
+                va = (unsigned long)map_domain_mem(req.ptr);
+                prev_pfn = pfn;
+            }
+
+            switch ( (page->type_and_flags & PGT_type_mask) )
+            {
+            case PGT_l1_page_table: 
+                if ( likely(get_page_type(page, PGT_l1_page_table)) )
+                {
+                    okay = mod_l1_entry((l1_pgentry_t *)va, 
+                                        mk_l1_pgentry(req.val)); 
+                    put_page_type(page);
+                }
+                break;
+            case PGT_l2_page_table:
+                if ( likely(get_page_type(page, PGT_l2_page_table)) )
+                {
+                    okay = mod_l2_entry((l2_pgentry_t *)va, 
+                                        mk_l2_pgentry(req.val),
+                                        pfn); 
+                    put_page_type(page);
+                }
+                break;
+            default:
+                if ( likely(get_page_type(page, PGT_writeable_page)) )
+                {
+                    *(unsigned long *)va = req.val;
+                    okay = 1;
+                    put_page_type(page);
+                }
+                break;
              }
+            
+            put_page(page);
+
              break;
  
          case MMU_UNCHECKED_PT_UPDATE:
              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
              if ( likely(IS_PRIV(current)) )
              {
-                *(unsigned long *)req.ptr = req.val;
-                err = 0;
+                if ( likely(prev_pfn == pfn) )
+                {
+                    va = (va & PAGE_MASK) | (req.ptr & ~PAGE_MASK);
+                }
+                else
+                {
+                    if ( prev_pfn != 0 )
+                        unmap_domain_mem((void *)va);
+                    va = (unsigned long)map_domain_mem(req.ptr);
+                    prev_pfn = pfn;
+                }
+                *(unsigned long *)va = req.val;
+                okay = 1;
              }
              else
              {
@@ -942,21 +1004,18 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
              break;
              
          case MMU_MACHPHYS_UPDATE:
-            page = frame_table + pfn;
+            page = &frame_table[pfn];
              if ( unlikely(pfn >= max_page) )
              {
                  MEM_LOG("Page out of range (%08lx > %08lx)", pfn, max_page);
              }
-            else if ( likely(DOMAIN_OKAY(page->flags)) )
+            else if ( likely(get_page(page, current)) ||
+                      ((current->domain == 0) && dom0_get_page(page)) )
              {
                  machine_to_phys_mapping[pfn] = req.val;
-                err = 0;
+                okay = 1;
+                put_page(page);
              }
-            else
-            {
-                MEM_LOG("Bad domain MPT update (dom %d, pfn %ld)",
-                        current->domain, pfn);
-            }            
              break;
  
              /*
@@ -965,7 +1024,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
               */
          case MMU_EXTENDED_COMMAND:
              req.ptr &= ~(sizeof(l1_pgentry_t) - 1);
-            err = do_extended_command(req.ptr, req.val);
+            okay = do_extended_command(req.ptr, req.val);
              break;
  
          default:
@@ -973,10 +1032,7 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
              break;
          }
  
-    unlock:
-        spin_unlock(&current->page_lock);
-
-        if ( unlikely(err) )
+        if ( unlikely(!okay) )
          {
              rc = -EINVAL;
              break;
@@ -985,20 +1041,20 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
          ureqs++;
      }
  
-    if ( deferred_op[cpu].flush_tlb )
-    {
-        deferred_op[cpu].flush_tlb = 0;
-        __write_cr3_counted(pagetable_val(current->mm.pagetable));
-    }
+    if ( prev_pfn != 0 )
+        unmap_domain_mem((void *)va);
  
-    if ( deferred_op[cpu].refresh_ldt )
-    {
-        deferred_op[cpu].refresh_ldt = 0;
+    flags = deferred_op[cpu].flags;
+    deferred_op[cpu].flags = 0;
+
+    if ( flags & DOP_FLUSH_TLB )
+        write_cr3_counted(pagetable_val(current->mm.pagetable));
+
+    if ( flags & DOP_RELOAD_LDT )
          (void)map_ldt_shadow_page(0);
-    }
  
-    if ( cr0 != 0 )
-        write_cr0(cr0);
+    if ( unlikely(flags & DOP_RESTORE_CR0) )
+        write_cr0(deferred_op[cpu].cr0);
  
      return rc;
  }
@@ -1006,48 +1062,34 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
  
  int do_update_va_mapping(unsigned long page_nr, 
                           unsigned long val, 
-                         unsigned long flags)
+                         unsigned long caller_flags)
  {
-    unsigned long _x, cr0 = 0;
      struct task_struct *p = current;
-    int err = -EINVAL;
+    int err = 0;
+    unsigned int cpu = p->processor;
+    unsigned long defer_flags;
  
      if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
-        goto out;
-
-    spin_lock(&p->page_lock);
+        return -EINVAL;
  
-    /* Check that the VA's page-directory entry is present.. */
-    if ( unlikely((err = __get_user(_x, (unsigned long *)
-                                    (&linear_pg_table[page_nr]))) != 0) )
-        goto unlock_and_out;
-
-    /* If the VA's page-directory entry is read-only, we frob the WP bit. */
-    if ( unlikely(__put_user(_x, (unsigned long *)
-                             (&linear_pg_table[page_nr]))) )
-    {
-        cr0 = read_cr0();
-        write_cr0(cr0 & ~X86_CR0_WP);        
-    }
-
-    if ( unlikely(mod_l1_entry(&linear_pg_table[page_nr], 
-                               mk_l1_pgentry(val)) != 0) )
-    {
+    if ( unlikely(!mod_l1_entry(&linear_pg_table[page_nr], 
+                                mk_l1_pgentry(val))) )
          err = -EINVAL;
-        goto check_cr0_unlock_and_out;
-    }
  
-    if ( unlikely(flags & UVMF_INVLPG) )
+    defer_flags = deferred_op[cpu].flags;
+    deferred_op[cpu].flags = 0;
+
+    if ( unlikely(defer_flags & DOP_FLUSH_TLB) || 
+         unlikely(caller_flags & UVMF_FLUSH_TLB) )
+        write_cr3_counted(pagetable_val(p->mm.pagetable));
+    else if ( unlikely(caller_flags & UVMF_INVLPG) )
          __flush_tlb_one(page_nr << PAGE_SHIFT);
  
-    if ( unlikely(flags & UVMF_FLUSH_TLB) )
-        __write_cr3_counted(pagetable_val(p->mm.pagetable));
+    if ( unlikely(defer_flags & DOP_RELOAD_LDT) )
+        (void)map_ldt_shadow_page(0);
+
+    if ( unlikely(defer_flags & DOP_RESTORE_CR0) )
+        write_cr0(deferred_op[cpu].cr0);
  
- check_cr0_unlock_and_out:
-    if ( unlikely(cr0 != 0) )
-        write_cr0(cr0);
- unlock_and_out:
-    spin_unlock(&p->page_lock);
- out:
      return err;
  }
diff --git a/xen/common/network.c b/xen/common/network.c

index 02b6f57580d8f976d94aff5dfabf011162df9398..14bfa8dac5def52ba9ce5142420d48a0d8e14966 100644 (file)
--- a/xen/common/network.c
+++ b/xen/common/network.c
@@ -90,7 +90,7 @@ net_vif_t *create_net_vif(int domain)
      if ( sizeof(net_ring_t) > PAGE_SIZE ) BUG();
      new_ring = (net_ring_t *)get_free_page(GFP_KERNEL);
      clear_page(new_ring);
-    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), domain);
+    SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
  
      /*
       * Fill in the new vif struct. Note that, while the vif's refcnt is
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c

index 9e227f574a50c9163c5a8313ec15ca59414eeb5e..ca609438e04e36bf0474828528710a0b00370994 100644 (file)
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -188,12 +188,12 @@ void __init init_page_allocator(unsigned long min, unsigned long max)
  /* Release a PHYSICAL address range to the allocator. */
  void release_bytes_to_allocator(unsigned long min, unsigned long max)
  {
-    min = round_pgup  (min) + PAGE_OFFSET;
-    max = round_pgdown(max) + PAGE_OFFSET;
+    min = round_pgup  (min);
+    max = round_pgdown(max);
  
      while ( min < max )
      {
-        __free_pages(min, 0);
+        __free_pages(min+PAGE_OFFSET, 0);
          min += PAGE_SIZE;
      }
  }
@@ -210,7 +210,6 @@ unsigned long __get_free_pages(int mask, int order)
  retry:
      spin_lock_irqsave(&alloc_lock, flags);
  
-
      /* Find smallest order which can satisfy the request. */
      for ( i = order; i < FREELIST_SIZE; i++ ) {
         if ( !FREELIST_EMPTY(free_head[i]) ) 
diff --git a/xen/drivers/block/ll_rw_blk.c b/xen/drivers/block/ll_rw_blk.c

index 55fbdf3e7944d166f26a547bca174f4185c0f44e..9e1b0de26600819cf13c3e014e254925ebcf877e 100644 (file)
--- a/xen/drivers/block/ll_rw_blk.c
+++ b/xen/drivers/block/ll_rw_blk.c
@@ -14,31 +14,15 @@
  #include <xeno/types.h>
  #include <xeno/lib.h>
  #include <xeno/sched.h>
-/*#include <xeno/kernel_stat.h>*/
  #include <xeno/errno.h>
-/*#include <xeno/locks.h>*/
  #include <xeno/mm.h>
-/*#include <xeno/swap.h>*/
  #include <xeno/init.h>
-/*#include <xeno/smp_lock.h>*/
-/*#include <xeno/completion.h>*/
-
  #include <asm/system.h>
  #include <asm/io.h>
  #include <xeno/blk.h>
-/*#include <xeno/highmem.h>*/
  #include <xeno/slab.h>
  #include <xeno/module.h>
  
-/*
- * KAF: We can turn off noise relating to barking guest-OS requests.
- */
-#if 0
-#define DPRINTK(_f, _a...) printk(_f , ## _a)
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
  /* This will die as all synchronous stuff is coming to an end */
  #if 0 
  #define complete(_r) panic("completion.h stuff may be needed...")
@@ -47,8 +31,6 @@
  #define complete(_r) (*(int *)(_r) = 0)
  #endif
  
-
-
  /*
   * MAC Floppy IWM hooks
   */
diff --git a/xen/drivers/block/xen_block.c b/xen/drivers/block/xen_block.c

index 5103d85ffdf7797a8e4d178c7f0e755faaf51f5e..8b1cb119e6e0fb5e4fe034ac98b3534b78a9cab2 100644 (file)
--- a/xen/drivers/block/xen_block.c
+++ b/xen/drivers/block/xen_block.c
@@ -20,12 +20,6 @@
  #include <xeno/vbd.h>
  #include <xeno/slab.h>
  
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
  /*
   * These are rather arbitrary. They are fairly large because adjacent
   * requests pulled from a communication ring are quite likely to end
@@ -60,15 +54,11 @@ static atomic_t nr_pending;
  
  static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
  
-static int __buffer_is_valid(struct task_struct *p, 
-                             unsigned long buffer, 
-                             unsigned short size,
-                             int writeable_buffer);
-static void __lock_buffer(unsigned long buffer,
-                          unsigned short size,
-                          int writeable_buffer);
-static void unlock_buffer(struct task_struct *p,
-                          unsigned long buffer,
+static int lock_buffer(struct task_struct *p,
+                       unsigned long buffer,
+                       unsigned short size,
+                       int writeable_buffer);
+static void unlock_buffer(unsigned long buffer,
                            unsigned short size,
                            int writeable_buffer);
  
@@ -185,8 +175,7 @@ static void end_block_io_op_softirq(struct softirq_action *h)
      {
          pending_req = bh->pending_req;
          
-        unlock_buffer(pending_req->domain, 
-                      virt_to_phys(bh->b_data), 
+        unlock_buffer(virt_to_phys(bh->b_data), 
                        bh->b_size, 
                        (pending_req->operation==READ));
          
@@ -321,97 +310,60 @@ long do_block_io_op(block_io_op_t *u_block_io_op)
   * DOWNWARD CALLS -- These interface with the block-device layer proper.
   */
  
-static int __buffer_is_valid(struct task_struct *p, 
-                             unsigned long buffer, 
-                             unsigned short size,
-                             int writeable_buffer)
+static int lock_buffer(struct task_struct *p,
+                       unsigned long buffer,
+                       unsigned short size,
+                       int writeable_buffer)
  {
      unsigned long    pfn;
      struct pfn_info *page;
-    int rc = 0;
  
-    /* A request may span multiple page frames. Each must be checked. */
      for ( pfn = buffer >> PAGE_SHIFT; 
            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
            pfn++ )
      {
-        /* Each frame must be within bounds of machine memory. */
-        if ( pfn >= max_page )
-        {
-            DPRINTK("pfn out of range: %08lx\n", pfn);
-            goto out;
-        }
+        if ( unlikely(pfn >= max_page) )
+            goto fail;
  
-        page = frame_table + pfn;
+        page = &frame_table[pfn];
  
-        /* Each frame must belong to the requesting domain. */
-        if ( (page->flags & PG_domain_mask) != p->domain )
-        {
-            DPRINTK("bad domain: expected %d, got %ld\n", 
-                    p->domain, page->flags & PG_domain_mask);
-            goto out;
-        }
+        if ( unlikely(!get_page(page, p)) )
+            goto fail;
  
-        /* If reading into the frame, the frame must be writeable. */
-        if ( writeable_buffer &&
-             ((page->flags & PG_type_mask) != PGT_writeable_page) &&
-             (page_type_count(page) != 0) )
+        if ( writeable_buffer && 
+             unlikely(!get_page_type(page, PGT_writeable_page)) )
          {
-            DPRINTK("non-writeable page passed for block read\n");
-            goto out;
+            put_page(page);
+            goto fail;
          }
-    }    
-
-    rc = 1;
- out:
-    return rc;
-}
+    }
  
-static void __lock_buffer(unsigned long buffer,
-                          unsigned short size,
-                          int writeable_buffer)
-{
-    unsigned long    pfn;
-    struct pfn_info *page;
+    return 1;
  
-    for ( pfn = buffer >> PAGE_SHIFT; 
-          pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
-          pfn++ )
-    {
-        page = frame_table + pfn;
+ fail:
+    while ( pfn-- > (buffer >> PAGE_SHIFT) )
+    {        
          if ( writeable_buffer )
-        {
-            if ( page_type_count(page) == 0 )
-            {
-                page->flags &= ~PG_type_mask;
-                /* No need for PG_need_flush here. */
-                page->flags |= PGT_writeable_page;
-            }
-            get_page_type(page);
-        }
-        get_page_tot(page);
+            put_page_type(&frame_table[pfn]);
+        put_page(&frame_table[pfn]);
      }
+    return 0;
  }
  
-static void unlock_buffer(struct task_struct *p,
-                          unsigned long buffer,
+static void unlock_buffer(unsigned long buffer,
                            unsigned short size,
                            int writeable_buffer)
  {
-    unsigned long    pfn;
-    struct pfn_info *page;
+    unsigned long pfn;
  
-    spin_lock(&p->page_lock);
      for ( pfn = buffer >> PAGE_SHIFT; 
            pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
            pfn++ )
      {
-        page = frame_table + pfn;
          if ( writeable_buffer )
-            put_page_type(page);
-        put_page_tot(page);
+            put_page_type(&frame_table[pfn]);
+        put_page(&frame_table[pfn]);
      }
-    spin_unlock(&p->page_lock);
  }
  
  static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
@@ -480,8 +432,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
      int new_segs, nr_psegs = 0;
      phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
  
-    spin_lock(&p->page_lock);
-
      /* Check that number of segments is sane. */
      if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
      {
@@ -506,7 +456,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
              goto bad_descriptor;
          }
  
-        if ( !__buffer_is_valid(p, buffer, nr_sects<<9, (operation==READ)) )
+        if ( !lock_buffer(p, buffer, nr_sects<<9, (operation==READ)) )
         {
              DPRINTK("invalid buffer\n");
              goto bad_descriptor;
@@ -530,6 +480,7 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
                          req->sector_number + tot_sects, 
                          req->sector_number + tot_sects + nr_sects, 
                          req->device); 
+                unlock_buffer(buffer, nr_sects<<9, (operation==READ));
                  goto bad_descriptor;
              }
  
@@ -546,12 +497,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
          if ( nr_psegs >= (MAX_BLK_SEGS*2) ) BUG();
      }
  
-    /* Lock pages associated with each buffer head. */
-    for ( i = 0; i < nr_psegs; i++ )
-        __lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9, 
-                      (operation==READ));
-    spin_unlock(&p->page_lock);
-
      atomic_inc(&nr_pending);
      pending_req = pending_reqs + pending_ring[pending_cons];
      PENDREQ_IDX_INC(pending_cons);
@@ -594,7 +539,6 @@ static void dispatch_rw_block_io(struct task_struct *p, int index)
      return;
  
   bad_descriptor:
-    spin_unlock(&p->page_lock);
      make_response(p, req->id, req->operation, 1);
  } 
  
@@ -670,7 +614,7 @@ void init_blkdev_info(struct task_struct *p)
      if ( sizeof(*p->blk_ring_base) > PAGE_SIZE ) BUG();
      p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
      clear_page(p->blk_ring_base);
-    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p->domain);
+    SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
      p->blkdev_list.next = NULL;
      spin_lock_init(&p->vbd_lock);
  }
@@ -680,7 +624,6 @@ void destroy_blkdev_info(struct task_struct *p)
  {
      ASSERT(!__on_blkdev_list(p));
      UNSHARE_PFN(virt_to_page(p->blk_ring_base));
-    free_page((unsigned long)p->blk_ring_base);
      destroy_all_vbds(p);
  }
  
diff --git a/xen/drivers/block/xen_vbd.c b/xen/drivers/block/xen_vbd.c

index f16adb6795bea5b983dea7417ffa6f5c2fa690d0..13da02d03c05d4f357cd3ad56b947d690003b227 100644 (file)
--- a/xen/drivers/block/xen_vbd.c
+++ b/xen/drivers/block/xen_vbd.c
@@ -23,13 +23,6 @@
  extern int ide_probe_devices(xen_disk_info_t *xdi);
  extern int scsi_probe_devices(xen_disk_info_t *xdi);
  
-
-#if 0
-#define DPRINTK(_f, _a...) printk( _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
  /* XXX SMH: crappy 'hash function' .. fix when care. */
  #define HSH(_x) ((_x) & (VBD_HTAB_SZ - 1))
  
@@ -447,16 +440,9 @@ long vbd_probe(vbd_probe_t *probe)
      if ( (probe->domain == VBD_PROBE_ALL) || IS_PRIV(p) )
      { 
          /* Privileged domains always get access to the 'real' devices. */
-        if ( (ret = ide_probe_devices(&probe->xdi)) != 0 ) 
-        {
-            DPRINTK("vbd_probe: error %d in probing ide devices\n", ret); 
+        if ( ((ret = ide_probe_devices(&probe->xdi)) != 0) ||
+             ((ret = scsi_probe_devices(&probe->xdi)) != 0) )
              goto out; 
-        }
-        if ( (ret = scsi_probe_devices(&probe->xdi)) != 0 )
-        { 
-            DPRINTK("vbd_probe: error %d in probing scsi devices\n", ret); 
-            goto out; 
-        }
      } 
  
      if ( probe->domain == VBD_PROBE_ALL )
@@ -469,8 +455,6 @@ long vbd_probe(vbd_probe_t *probe)
              { 
                  if( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
                  { 
-                    DPRINTK("vbd_probe: error %d in probing virtual devices\n",
-                            ret); 
                      read_unlock_irqrestore(&tasklist_lock, flags);
                      goto out; 
                  }
@@ -478,17 +462,12 @@ long vbd_probe(vbd_probe_t *probe)
          }
          read_unlock_irqrestore(&tasklist_lock, flags);
      } 
-    else 
-    { 
-        if ( (ret = vbd_probe_devices(&probe->xdi, p)) )
-        { 
-            DPRINTK("vbd_probe: error %d in probing virtual devices\n", ret); 
-            goto out; 
-        }
-
-    }
+    else if ( (ret = vbd_probe_devices(&probe->xdi, p)) != 0 )
+        goto out; 
  
   out: 
+    if ( ret != 0 )
+        DPRINTK("vbd_probe: err %ld in probing virtual devices\n", ret); 
      if ( p != NULL )
          put_task_struct(p); 
      return ret; 
diff --git a/xen/drivers/net/e1000/e1000_main.c b/xen/drivers/net/e1000/e1000_main.c

index 4d88a61465185838c89830ebdffd1e4c6d337a5f..f6f5bb7aa80a7bf806ddc56641783d551fa8b874 100644 (file)
--- a/xen/drivers/net/e1000/e1000_main.c
+++ b/xen/drivers/net/e1000/e1000_main.c
@@ -1816,10 +1816,12 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  static void
  e1000_tx_timeout(struct net_device *netdev)
  {
+#if 0
         struct e1000_adapter *adapter = netdev->priv;
  
         /* Do the reset outside of interrupt context */
-       //schedule_work(&adapter->tx_timeout_task);
+       schedule_work(&adapter->tx_timeout_task);
+#endif
         e1000_tx_timeout_task(netdev); // XXXX HACK!!! XEN
  }
  
diff --git a/xen/include/asm-i386/atomic.h b/xen/include/asm-i386/atomic.h

index 70a1212ed65bea460957fdf506933595966c7509..9dcdca93f7bccbaac226f76f086996aaa38a38d5 100644 (file)
--- a/xen/include/asm-i386/atomic.h
+++ b/xen/include/asm-i386/atomic.h
@@ -186,15 +186,6 @@ static __inline__ int atomic_add_negative(int i, atomic_t *v)
         return c;
  }
  
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
-__asm__ __volatile__(LOCK "andl %0,%1" \
-: : "r" (~(mask)),"m" (*addr) : "memory")
-
-#define atomic_set_mask(mask, addr) \
-__asm__ __volatile__(LOCK "orl %0,%1" \
-: : "r" (mask),"m" (*addr) : "memory")
-
  /* Atomic operations are already serializing on x86 */
  #define smp_mb__before_atomic_dec()    barrier()
  #define smp_mb__after_atomic_dec()     barrier()
diff --git a/xen/include/asm-i386/flushtlb.h b/xen/include/asm-i386/flushtlb.h

index 3a063fc0cbfbd7123df0aa90af04d152c516ca1a..e6f61cb521e5314bf4e79d36eba9f0f812a3af45 100644 (file)
--- a/xen/include/asm-i386/flushtlb.h
+++ b/xen/include/asm-i386/flushtlb.h
@@ -1,40 +1,39 @@
  /******************************************************************************
   * flushtlb.h
   * 
- * TLB flush macros that count flushes.  Counting is used to enforce 
- * zero-copy safety, particularily for the network code.
- *
- * akw - Jan 21, 2003
+ * TLB flushes are timestamped using a global virtual 'clock' which ticks
+ * on any TLB flush on any processor.
+ * 
+ * Copyright (c) 2003, K A Fraser
   */
  
-#ifndef __FLUSHTLB_H
-#define __FLUSHTLB_H
+#ifndef __FLUSHTLB_H__
+#define __FLUSHTLB_H__
  
  #include <xeno/smp.h>
-#include <asm/atomic.h>
-
-atomic_t tlb_flush_count[NR_CPUS];
-
-#define __write_cr3_counted(__pa)                                       \
-    do {                                                                \
-                __asm__ __volatile__ (                                  \
-                        "movl %0, %%cr3;"                               \
-                        :: "r" (__pa)                                   \
-                        : "memory");                                    \
-                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
-    } while (0)
-
-#define __flush_tlb_counted()                                           \
-        do {                                                            \
-                unsigned int tmpreg;                                    \
-                                                                        \
-                __asm__ __volatile__(                                   \
-                        "movl %%cr3, %0;  # flush TLB \n"               \
-                        "movl %0, %%cr3;                "               \
-                        : "=r" (tmpreg)                                 \
-                        :: "memory");                                   \
-                atomic_inc(&tlb_flush_count[smp_processor_id()]);       \
-        } while (0)
-
-#endif
-                           
+
+/*
+ * Every GLOBAL_FLUSH_PERIOD ticks of the tlbflush clock, every TLB in the
+ * system is guaranteed to have been flushed.
+ */
+#define GLOBAL_FLUSH_PERIOD (1<<16)
+
+/*
+ * '_cpu_stamp' is the current timestamp for the CPU we are testing.
+ * '_lastuse_stamp' is a timestamp taken when the PFN we are testing was last 
+ * used for a purpose that may have caused the CPU's TLB to become tainted.
+ */
+#define NEED_FLUSH(_cpu_stamp, _lastuse_stamp) \
+ (((_cpu_stamp) > (_lastuse_stamp)) ||         \
+  (((_lastuse_stamp) - (_cpu_stamp)) > (2*GLOBAL_FLUSH_PERIOD)))
+
+extern unsigned long tlbflush_mask;
+extern unsigned long tlbflush_clock;
+extern unsigned long tlbflush_time[NR_CPUS];
+
+extern void new_tlbflush_clock_period(void);
+
+extern void write_cr3_counted(unsigned long pa);
+extern void flush_tlb_counted(void);
+
+#endif /* __FLUSHTLB_H__ */
diff --git a/xen/include/asm-i386/io.h b/xen/include/asm-i386/io.h

index 9b54ae278de726d8025fab0a29bdb5291b8a77de..1bae91a1e2ab476e638ce14c8d58b55e2ca0d893 100644 (file)
--- a/xen/include/asm-i386/io.h
+++ b/xen/include/asm-i386/io.h
@@ -36,10 +36,9 @@ static inline void * phys_to_virt(unsigned long address)
         return __va(address);
  }
  
-/*
- * Change "struct page" to physical address.
- */
-#define page_to_phys(page)     ((page - frame_table) << PAGE_SHIFT)
+#define page_to_pfn(_page)  ((unsigned long)((_page) - frame_table))
+#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT)
+#define page_to_virt(_page) phys_to_virt(page_to_phys(_page))
  
  extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags);
  
diff --git a/xen/include/asm-i386/page.h b/xen/include/asm-i386/page.h

index c9191c43ebe813d4b16b542a7301ec33e90698d6..2fc1c43ef05cf9204e7ec7b3e1740ab90a30b0a2 100644 (file)
--- a/xen/include/asm-i386/page.h
+++ b/xen/include/asm-i386/page.h
@@ -92,7 +92,7 @@ typedef struct { unsigned long pt_lo; } pagetable_t;
  extern l2_pgentry_t idle_pg_table[ENTRIES_PER_L2_PAGETABLE];
  extern void paging_init(void);
  
-#define __flush_tlb() __flush_tlb_counted()
+#define __flush_tlb() flush_tlb_counted()
  
  /* Flush global pages as well. */
  
@@ -111,10 +111,10 @@ extern void paging_init(void);
          } while (0)
  
  
-#define __flush_tlb_all()                                              \
+#define __flush_tlb_pge()                                              \
         do {                                                            \
                  __pge_off();                                            \
-               __flush_tlb_counted();                                  \
+               flush_tlb_counted();                                    \
                  __pge_on();                                             \
         } while (0)
  
diff --git a/xen/include/asm-i386/pgalloc.h b/xen/include/asm-i386/pgalloc.h

index 841e5fd4a1b6bbd2ccaf2aa33e549a0a0534df80..88e906464119617448db34f527a3b95f1cdb65fe 100644 (file)
--- a/xen/include/asm-i386/pgalloc.h
+++ b/xen/include/asm-i386/pgalloc.h
@@ -47,28 +47,24 @@
  
  #ifndef CONFIG_SMP
  
-#define flush_tlb()         __flush_tlb()
-#define flush_tlb_all()     __flush_tlb_all()
-#define local_flush_tlb()   __flush_tlb()
-#define flush_tlb_cpu(_cpu) __flush_tlb()
+#define flush_tlb()           __flush_tlb()
+#define flush_tlb_all()       __flush_tlb()
+#define flush_tlb_all_pge()   __flush_tlb_pge()
+#define local_flush_tlb()     __flush_tlb()
+#define flush_tlb_cpu(_cpu)   __flush_tlb()
+#define flush_tlb_mask(_mask) __flush_tlb()
  
  #else
  
  #include <xeno/smp.h>
  
+extern void flush_tlb_mask(unsigned long mask);
+extern void flush_tlb_all_pge(void);
+
  #define flush_tlb()        __flush_tlb()
+#define flush_tlb_all()     flush_tlb_mask((1 << smp_num_cpus) - 1)
  #define local_flush_tlb()   __flush_tlb()
-
-extern void flush_tlb_all(void);
-
-extern void flush_tlb_others(unsigned long cpumask);
-static inline void flush_tlb_cpu(unsigned int cpu)
-{
-    if ( cpu == smp_processor_id() )
-        __flush_tlb();
-    else
-        flush_tlb_others(1<<cpu);
-}
+#define flush_tlb_cpu(_cpu) flush_tlb_mask(1 << (_cpu))
  
  #endif
  
diff --git a/xen/include/asm-i386/smp.h b/xen/include/asm-i386/smp.h

index cfec568c43883ea58e0b6f3e4f5fdab8db264c9e..08eef3c8bdea3eac4280f74412dd6c2bd44fb856 100644 (file)
--- a/xen/include/asm-i386/smp.h
+++ b/xen/include/asm-i386/smp.h
@@ -1,15 +1,8 @@
  #ifndef __ASM_SMP_H
  #define __ASM_SMP_H
  
-#ifndef __ASSEMBLY__
  #include <xeno/config.h>
  #include <asm/ptrace.h>
-#include <asm/fixmap.h>
-#include <asm/bitops.h>
-#include <asm/mpspec.h>
-#include <asm/io_apic.h>
-#include <asm/apic.h>
-#endif
  
  #ifdef CONFIG_SMP
  #define TARGET_CPUS cpu_online_map
@@ -18,8 +11,6 @@
  #endif
  
  #ifdef CONFIG_SMP
-#ifndef __ASSEMBLY__
-
  /*
   * Private routines/data
   */
@@ -74,6 +65,9 @@ extern void smp_store_cpu_info(int id);               /* Store per CPU info (like the initial
  
  #define smp_processor_id() (current->processor)
  
+#include <asm/fixmap.h>
+#include <asm/apic.h>
+
  static __inline int hard_smp_processor_id(void)
  {
         /* we don't want to mark this access volatile - bad code generation */
@@ -86,7 +80,5 @@ static __inline int logical_smp_processor_id(void)
         return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
  }
  
-#endif /* !__ASSEMBLY__ */
-
  #endif
  #endif
diff --git a/xen/include/asm-i386/spinlock.h b/xen/include/asm-i386/spinlock.h

index 59dc7b209ff6f1a6cc366b9a6c9e1f2c80fac67a..9a4fc8573da8aaa67cdea4df61019e48c7ef3946 100644 (file)
--- a/xen/include/asm-i386/spinlock.h
+++ b/xen/include/asm-i386/spinlock.h
@@ -1,11 +1,10 @@
  #ifndef __ASM_SPINLOCK_H
  #define __ASM_SPINLOCK_H
  
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
-#include <asm/page.h>
  #include <xeno/config.h>
  #include <xeno/lib.h>
+#include <asm/atomic.h>
+#include <asm/rwlock.h>
  
  #if 0
  #define SPINLOCK_DEBUG 1
diff --git a/xen/include/asm-i386/system.h b/xen/include/asm-i386/system.h

index dc4ac3398b824f584468590d0afc071fb4550199..3e85277d6c1d162ac22372e51d3573b7de672220 100644 (file)
--- a/xen/include/asm-i386/system.h
+++ b/xen/include/asm-i386/system.h
@@ -93,7 +93,34 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
  #define cmpxchg(ptr,o,n)\
         ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
                                         (unsigned long)(n),sizeof(*(ptr))))
-    
+
+
+/*
+ * This macro causes longword _o to be changed to _n at location _p.
+ * If this access causes a fault then we return 1, otherwise we return 0.
+ * If no fault occurs then _o is updated to the value we saw at _p. If this
+ * is the same as the initial value of _o then _n is written to location _p.
+ */
+#define cmpxchg_user(_p,_o,_n)                                          \
+({                                                                      \
+    int _rc;                                                            \
+    __asm__ __volatile__ (                                              \
+        "1: " LOCK_PREFIX "cmpxchgl %2,%3\n"                            \
+        "2:\n"                                                          \
+        ".section .fixup,\"ax\"\n"                                      \
+        "3:     movl $1,%1\n"                                           \
+        "       jmp 2b\n"                                               \
+        ".previous\n"                                                   \
+        ".section __ex_table,\"a\"\n"                                   \
+        "       .align 4\n"                                             \
+        "       .long 1b,3b\n"                                          \
+        ".previous"                                                     \
+        : "=a" (_o), "=r" (_rc)                                         \
+        : "q" (_n), "m" (*__xg((volatile void *)(_p))), "0" (_o), "1" (0) \
+        : "memory");                                                    \
+    _rc;                                                                \
+})
+
  /*
   * Force strict CPU ordering.
   * And yes, this is required on UP too when we're talking
diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h

index 045e4ad70e35dfb26402db0d14e7e94284ccf5f4..e1d20ff2c7fe24c3cc72c8bed0cd605f40d3ed53 100644 (file)
--- a/xen/include/hypervisor-ifs/dom0_ops.h
+++ b/xen/include/hypervisor-ifs/dom0_ops.h
@@ -141,8 +141,8 @@ typedef struct dom0_getpageframeinfo_st
  {
      /* IN variables. */
      unsigned long pfn;          /* Machine page frame number to query.       */
-    /* OUT variables. */
      unsigned int domain;        /* To which domain does the frame belong?    */
+    /* OUT variables. */
      enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type?       */
  } dom0_getpageframeinfo_t;
  
diff --git a/xen/include/hypervisor-ifs/hypervisor-if.h b/xen/include/hypervisor-ifs/hypervisor-if.h

index 5bd13dba9b599e2034f0b72897db5bd61eef9c19..145b1a0aacc174748d677e4de6588425539d6514 100644 (file)
--- a/xen/include/hypervisor-ifs/hypervisor-if.h
+++ b/xen/include/hypervisor-ifs/hypervisor-if.h
@@ -125,9 +125,9 @@
   *  which shifts the least bits out.
   */
  /* A normal page-table update request. */
-#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is VA.      */
+#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.      */
  /* DOM0 can make entirely unchecked updates which do not affect refcnts. */
-#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is VA.    */
+#define MMU_UNCHECKED_PT_UPDATE  1 /* unchecked '*ptr = val'. ptr is MA.    */
  /* Update an entry in the machine->physical mapping table. */
  #define MMU_MACHPHYS_UPDATE      2 /* ptr = MA of frame to modify entry for */
  /* An extended command. */
diff --git a/xen/include/xeno/config.h b/xen/include/xeno/config.h

index 64a99f66ce20e327b2f7028d4a7ac7f35cf5b8c6..c88e41d15b31b4ffe5ade2d385046de9b3cb5dd6 100644 (file)
--- a/xen/include/xeno/config.h
+++ b/xen/include/xeno/config.h
@@ -145,6 +145,13 @@
  
  #define capable(_c) 0
  
+#ifndef NDEBUG
+#define DPRINTK(_f, _a...) printk("(file=%s, line=%d) " _f, \
+                           __FILE__, __LINE__, ## _a)
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
  #ifndef __ASSEMBLY__
  
  #include <xeno/compiler.h>
diff --git a/xen/include/xeno/mm.h b/xen/include/xeno/mm.h

index 8f0c0323670ac8dcc1f5c3ba72bc16a51cbad880..d5c3c5d6cb06cebda7181ced167be16305f3b48b 100644 (file)
--- a/xen/include/xeno/mm.h
+++ b/xen/include/xeno/mm.h
@@ -3,34 +3,35 @@
  #define __XENO_MM_H__
  
  #include <xeno/config.h>
+#include <xeno/list.h>
+#include <xeno/spinlock.h>
+#include <xeno/perfc.h>
+#include <xeno/sched.h>
+
+#include <asm/pgalloc.h>
  #include <asm/atomic.h>
  #include <asm/desc.h>
-#include <xeno/list.h>
+#include <asm/flushtlb.h>
+#include <asm/io.h>
+
  #include <hypervisor-ifs/hypervisor-if.h>
-#include <xeno/spinlock.h>
  
-/* XXX KAF: These may die eventually, but so many refs in slab.c :((( */
+/*
+ * These are for compatibility with calls to the Linux memory allocators.
+ */
  
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */
  #define __GFP_DMA       0x01
-
-/* Action modifiers - doesn't change the zoning */
+#define GFP_DMA         __GFP_DMA
  #define __GFP_WAIT      0x10    /* Can wait and reschedule? */
  #define __GFP_HIGH      0x20    /* Should access emergency pools? */
  #define __GFP_IO        0x40    /* Can start low memory physical IO? */
  #define __GFP_HIGHIO    0x80    /* Can start high mem physical IO? */
  #define __GFP_FS        0x100   /* Can call down to low-level FS? */
-
  #define GFP_ATOMIC      (__GFP_HIGH)
-#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-
-/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
-   platforms, used as appropriate on others */
+#define GFP_KERNEL      (__GFP_HIGH | __GFP_WAIT | __GFP_IO | \
+                         __GFP_HIGHIO | __GFP_FS)
  
-#define GFP_DMA         __GFP_DMA
-
-
-/******************************************************************************
+/*
   * The following is for page_alloc.c.
   */
  
@@ -44,95 +45,80 @@ void __free_pages(unsigned long p, int order);
  #define free_page(_p) (__free_pages(_p,0))
  
  
-/******************************************************************************
- * The following is the array of page info. One entry per page owned
- * by the hypervisor, indexed from `mem_map', just like Linux.
- *
- * 12.11.02. We no longer use struct page or mem_map, these are replaced
- * with struct pfn_info and frame_table respectively. Boris Dragovic
- */
-
-typedef struct pfn_info {
-    struct list_head list;      /* ->mapping has some page lists. */
-    unsigned long flags;        /* atomic flags. */
-    unsigned long tot_count;    /* Total domain usage count. */
-    unsigned long type_count;   /* pagetable/dir, or domain-writeable refs. */
-} frame_table_t;
-
-#define get_page_tot(p)                 ((p)->tot_count++)
-#define put_page_tot(p)                 \
-    ({ ASSERT((p)->tot_count != 0); --(p)->tot_count; })
-#define page_tot_count(p)       ((p)->tot_count)
-#define set_page_tot_count(p,v)  ((p)->tot_count = v)
-
-#define get_page_type(p)        ((p)->type_count++)
-#define put_page_type(p)        \
-    ({ ASSERT((p)->type_count != 0); --(p)->type_count; })
-#define page_type_count(p)      ((p)->type_count)
-#define set_page_type_count(p,v) ((p)->type_count = v)
-
-#define PG_domain_mask MAX_DOMAIN_ID /* owning domain (16 bits) */
-/* hypervisor flags (domain == 0) */
-#define PG_slab               24
-/* domain flags (domain != 0) */
  /*
- * NB. The following page types are MUTUALLY EXCLUSIVE.
- * At most one can be true at any point, and 'type_count' counts how many
- * references exist of the current type. A change in type can only occur
- * when type_count == 0.
+ * Per-page-frame information.
   */
-#define PG_type_mask        (15<<24) /* bits 24-27 */
-#define PGT_none            (0<<24) /* no special uses of this page */
-#define PGT_l1_page_table   (1<<24) /* using this page as an L1 page table? */
-#define PGT_l2_page_table   (2<<24) /* using this page as an L2 page table? */
-#define PGT_l3_page_table   (3<<24) /* using this page as an L3 page table? */
-#define PGT_l4_page_table   (4<<24) /* using this page as an L4 page table? */
-#define PGT_gdt_page        (5<<24) /* using this page in a GDT? */
-#define PGT_ldt_page        (6<<24) /* using this page in an LDT? */
-#define PGT_writeable_page  (7<<24) /* has writable mappings of this page? */
  
-/*
- * This bit indicates that the TLB must be flushed when the type count of this
- * frame drops to zero. This is needed on current x86 processors only for
- * frames which have guestos-accessible writeable mappings. In this case we
- * must prevent stale TLB entries allowing the frame to be written if it used
- * for a page table, for example.
- * 
- * We have this bit because the writeable type is actually also used to pin a
- * page when it is used as a disk read buffer. This doesn't require a TLB flush
- * because the frame never has a mapping in the TLB.
- */
-#define PG_need_flush       (1<<28)
+struct pfn_info
+{
+    /* Each frame can be threaded onto a doubly-linked list. */
+    struct list_head list;
+    /* The following possible uses are context-dependent. */
+    union {
+        /* Page is in use and not a zombie: we keep a pointer to its owner. */
+        struct task_struct *domain;
+        /* Page is not currently allocated: mask of possibly-tainted TLBs. */
+        unsigned long cpu_mask;
+        /* Page is a zombie: this word currently has no use. */
+        unsigned long _unused;
+    } u;
+    /* Reference count and various PGC_xxx flags and fields. */
+    unsigned long       count_and_flags;
+    /* Type reference count and various PGT_xxx flags and fields. */
+    unsigned long       type_and_flags;
+    /* Timestamp from 'TLB clock', used to reduce need for safety flushes. */
+    unsigned long       tlbflush_timestamp;
+};
  
-/*
- * This bit indicates that the guest OS has pinned the page to its current
- * type. For page tables this can avoid the frame scanning and reference-count
- * updates that occur when the type count falls to zero.
- */
-#define PG_guest_pinned     (1<<29)
+ /* The following page types are MUTUALLY EXCLUSIVE. */
+#define PGT_none            (0<<29) /* no special uses of this page */
+#define PGT_l1_page_table   (1<<29) /* using this page as an L1 page table? */
+#define PGT_l2_page_table   (2<<29) /* using this page as an L2 page table? */
+#define PGT_l3_page_table   (3<<29) /* using this page as an L3 page table? */
+#define PGT_l4_page_table   (4<<29) /* using this page as an L4 page table? */
+#define PGT_gdt_page        (5<<29) /* using this page in a GDT? */
+#define PGT_ldt_page        (6<<29) /* using this page in an LDT? */
+#define PGT_writeable_page  (7<<29) /* has writable mappings of this page? */
+#define PGT_type_mask       (7<<29) /* Bits 29-31. */
+ /* Has this page been validated for use as its current type? */
+#define _PGT_validated      28
+#define PGT_validated       (1<<_PGT_validated)
+ /* 28-bit count of uses of this frame as its current type. */
+#define PGT_count_mask      ((1<<28)-1)
  
-#define PageSlab(page)         test_bit(PG_slab, &(page)->flags)
-#define PageSetSlab(page)      set_bit(PG_slab, &(page)->flags)
-#define PageClearSlab(page)    clear_bit(PG_slab, &(page)->flags)
+ /* The owner of this page is dead: 'u.domain' is no longer valid. */
+#define _PGC_zombie                   31
+#define PGC_zombie                    (1<<_PGC_zombie)
+ /* For safety, force a TLB flush when this page's type changes. */
+#define _PGC_tlb_flush_on_type_change 30
+#define PGC_tlb_flush_on_type_change  (1<<_PGC_tlb_flush_on_type_change)
+ /* Owning guest has pinned this page to its current type? */
+#define _PGC_guest_pinned             29
+#define PGC_guest_pinned              (1<<_PGC_guest_pinned)
+ /* Cleared when the owning guest 'frees' this page. */
+#define _PGC_allocated                28
+#define PGC_allocated                 (1<<_PGC_allocated)
+ /* 28-bit count of references to this frame. */
+#define PGC_count_mask                ((1<<28)-1)
  
-#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                            \
-    do {                                                             \
-        (_pfn)->flags = (_dom) | PGT_writeable_page | PG_need_flush; \
-        set_page_tot_count((_pfn), 2);                               \
-        set_page_type_count((_pfn), 2);                              \
-    } while ( 0 )
+/* We trust the slab allocator in slab.c, and our use of it. */
+#define PageSlab(page)         (1)
+#define PageSetSlab(page)      ((void)0)
+#define PageClearSlab(page)    ((void)0)
+
+#define IS_XEN_HEAP_FRAME(_pfn) (page_to_phys(_pfn) < MAX_MONITOR_ADDRESS)
  
-#define UNSHARE_PFN(_pfn)                                            \
-    do {                                                             \
-        (_pfn)->flags = 0;                                           \
-        set_page_tot_count((_pfn), 0);                               \
-        set_page_type_count((_pfn), 0);                              \
+#define SHARE_PFN_WITH_DOMAIN(_pfn, _dom)                                  \
+    do {                                                                   \
+        (_pfn)->u.domain = (_dom);                                         \
+        wmb(); /* install valid domain ptr before updating refcnt. */      \
+        (_pfn)->count_and_flags = 1; /* Xen holds a writeable reference */ \
+        (_pfn)->type_and_flags  = PGT_writeable_page | PGT_validated | 1;  \
      } while ( 0 )
  
-/* The array of struct pfn_info,  
- * free pfn list and number of free pfns in the free list
- */
-extern frame_table_t * frame_table;
+#define UNSHARE_PFN(_pfn) put_page_and_type(_pfn)
+
+extern struct pfn_info *frame_table;
  extern unsigned long frame_table_size;
  extern struct list_head free_list;
  extern spinlock_t free_list_lock;
@@ -140,6 +126,180 @@ extern unsigned int free_pfns;
  extern unsigned long max_page;
  void init_frametable(unsigned long nr_pages);
  
+struct pfn_info *alloc_domain_page(struct task_struct *p);
+void free_domain_page(struct pfn_info *page);
+
+int alloc_page_type(struct pfn_info *page, unsigned int type);
+void free_page_type(struct pfn_info *page, unsigned int type);
+
+static inline void put_page(struct pfn_info *page)
+{
+    unsigned long nx, x, y = page->count_and_flags;
+
+    do {
+        x  = y;
+        nx = x - 1;
+    }
+    while ( unlikely((y = cmpxchg(&page->count_and_flags, x, nx)) != x) );
+
+    if ( unlikely((nx & PGC_count_mask) == 0) )
+        free_domain_page(page);
+}
+
+
+static inline int get_page(struct pfn_info *page,
+                           struct task_struct *domain)
+{
+    unsigned long x, nx, y = page->count_and_flags;
+    struct task_struct *p, *np = page->u.domain;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        p  = np;
+        if ( unlikely((x & PGC_count_mask) == 0) ||  /* Not allocated? */
+             unlikely((nx & PGC_count_mask) == 0) || /* Count overflow? */
+             unlikely(x & PGC_zombie) ||             /* Zombie? */
+             unlikely(p != domain) )                 /* Wrong owner? */
+        {
+            DPRINTK("Error pfn %08lx: ed=%p,sd=%p,caf=%08lx\n",
+                    page_to_pfn(page), domain, p, x);
+            return 0;
+        }
+        __asm__ __volatile__(
+            LOCK_PREFIX "cmpxchg8b %3"
+            : "=a" (np), "=d" (y), "=b" (p),
+              "=m" (*(volatile unsigned long long *)(&page->u.domain))
+            : "0" (p), "1" (x), "b" (p), "c" (nx) );
+    }
+    while ( unlikely(np != p) || unlikely(y != x) );
+
+    return 1;
+}
+
+
+static inline void put_page_type(struct pfn_info *page)
+{
+    unsigned long nx, x, y = page->type_and_flags;
+
+ again:
+    do {
+        x  = y;
+        nx = x - 1;
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            page->tlbflush_timestamp = tlbflush_clock;
+            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+                 likely(nx & PGT_validated) )
+            {
+                /*
+                 * Page-table pages must be unvalidated when count is zero. The
+                 * 'free' is safe because the refcnt is non-zero and the
+                 * validated bit is clear => other ops will spin or fail.
+                 */
+                if ( unlikely((y = cmpxchg(&page->type_and_flags, x, 
+                                           x & ~PGT_validated)) != x) )
+                    goto again;
+                /* We cleared the 'valid bit' so we must do the clear up. */
+                free_page_type(page, x & PGT_type_mask);
+                /* Carry on as we were, but with the 'valid bit' now clear. */
+                x  &= ~PGT_validated;
+                nx &= ~PGT_validated;
+            }
+        }
+    }
+    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
+}
+
+
+static inline int get_page_type(struct pfn_info *page, unsigned long type)
+{
+    unsigned long nx, x, y = page->type_and_flags;
+ again:
+    do {
+        x  = y;
+        nx = x + 1;
+        if ( unlikely((nx & PGT_count_mask) == 0) )
+        {
+            DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
+            return 0;
+        }
+        else if ( unlikely((x & PGT_count_mask) == 0) )
+        {
+            if ( (x & PGT_type_mask) != type )
+            {
+                nx &= ~(PGT_type_mask | PGT_validated);
+                nx |= type;
+                /* No extra validation needed for writeable pages. */
+                if ( type == PGT_writeable_page )
+                    nx |= PGT_validated;
+            }
+        }
+        else if ( unlikely((x & PGT_type_mask) != type) )
+        {
+            DPRINTK("Unexpected type (saw %08lx != exp %08lx) for pfn %08lx\n",
+                    x & PGT_type_mask, type, page_to_pfn(page));
+            return 0;
+        }
+        else if ( unlikely(!(x & PGT_validated)) )
+        {
+            /* Someone else is validating this page: spin until it changes. */
+            while ( (y = page->type_and_flags) == x )
+            {
+                rep_nop();
+                barrier();
+            }
+            goto again;
+        }
+    }
+    while ( unlikely((y = cmpxchg(&page->type_and_flags, x, nx)) != x) );
+
+    if ( unlikely(!(nx & PGT_validated)) )
+    {
+        /* Try to validate page type; drop the new reference on failure. */
+        if ( unlikely(!alloc_page_type(page, type)) )
+        {
+            DPRINTK("Error while validating pfn %08lx for type %08lx\n",
+                    page_to_pfn(page), type);
+            put_page_type(page);
+            return 0;
+        }
+        set_bit(_PGT_validated, &page->type_and_flags);
+    }
+
+    return 1;
+}
+
+
+static inline void put_page_and_type(struct pfn_info *page)
+{
+    put_page_type(page);
+    put_page(page);
+}
+
+
+static inline int get_page_and_type(struct pfn_info *page,
+                                    struct task_struct *domain,
+                                    unsigned int type)
+{
+    int rc = get_page(page, domain);
+
+    if ( likely(rc) && unlikely(!get_page_type(page, type)) )
+    {
+        put_page(page);
+        rc = 0;
+    }
+
+    return rc;
+}
+
+#define ASSERT_PAGE_IS_TYPE(_p, _t)                                    \
+    do { ASSERT(((_p)->type_and_flags & PGT_type_mask) == (_t));       \
+         ASSERT(((_p)->type_and_flags & PGT_count_mask) != 0); } while ( 0 )
+#define ASSERT_PAGE_IS_DOMAIN(_p, _d)                                  \
+    do { ASSERT(((_p)->count_and_flags & PGC_count_mask) != 0);        \
+         ASSERT((_p)->u.domain == (_d)); } while ( 0 )
+
  int check_descriptor(unsigned long a, unsigned long b);
  
  /*
diff --git a/xen/include/xeno/perfc.h b/xen/include/xeno/perfc.h

index 4048000790e66afdb9d3f525c36d167aff2e6f66..9ea244b3b84f91165a286928fa39ac30160ffb7d 100644 (file)
--- a/xen/include/xeno/perfc.h
+++ b/xen/include/xeno/perfc.h
@@ -1,6 +1,6 @@
-/*
- * xen performance counters
- */
+
+#ifndef __XENO_PERFC_H__
+#define __XENO_PERFC_H__
  
  #include <asm/atomic.h>
  
@@ -53,3 +53,4 @@ extern struct perfcounter_t perfcounters;
  #define perfc_addc(x,y)   atomic_add((y), &perfcounters.x[smp_processor_id()])
  #define perfc_adda(x,y,z) atomic_add((z), &perfcounters.x[y])
  
+#endif /* __XENO_PERFC_H__ */
diff --git a/xen/include/xeno/perfc_defn.h b/xen/include/xeno/perfc_defn.h

index 033f12c8c90a3ff2072af44a681e3bca5e097ad6..f81b5bcba19d770e82189674521e0a1866e29086 100644 (file)
--- a/xen/include/xeno/perfc_defn.h
+++ b/xen/include/xeno/perfc_defn.h
@@ -12,7 +12,6 @@ PERFCOUNTER( net_hypercalls, "network hypercalls" )
  PERFCOUNTER( net_rx_congestion_drop, "net rx congestion drops" )
  PERFCOUNTER( net_rx_capacity_drop, "net rx capacity drops" )
  PERFCOUNTER( net_rx_delivered, "net rx delivered" )
-PERFCOUNTER( net_rx_tlbflush, "net rx tlb flushes" )
  PERFCOUNTER( net_tx_transmitted, "net tx transmitted" )
  
  PERFCOUNTER_CPU( domain_page_tlb_flush, "domain page tlb flushes" )
diff --git a/xen/include/xeno/sched.h b/xen/include/xeno/sched.h

index 736201446a5227f73801cc73cd3e07ca1e905267..6c1984d7957eca5bf33a8500ee8c78c03dda4510 100644 (file)
--- a/xen/include/xeno/sched.h
+++ b/xen/include/xeno/sched.h
@@ -4,7 +4,6 @@
  #include <xeno/config.h>
  #include <xeno/types.h>
  #include <xeno/spinlock.h>
-#include <asm/page.h>
  #include <asm/ptrace.h>
  #include <xeno/smp.h>
  #include <asm/processor.h>
@@ -16,7 +15,6 @@
  #include <xeno/time.h>
  #include <xeno/ac_timer.h>
  #include <xeno/delay.h>
-#include <xeno/slab.h>
  
  #define MAX_DOMAIN_NAME 16
  
@@ -94,9 +92,10 @@ struct task_struct
      
      unsigned int domain;        /* domain id */
  
-    struct list_head pg_head;
-    unsigned int tot_pages;     /* number of pages currently possesed */
-    unsigned int max_pages;     /* max number of pages that can be possesed */
+    spinlock_t       page_list_lock;
+    struct list_head page_list;
+    unsigned int     tot_pages; /* number of pages currently possesed */
+    unsigned int     max_pages; /* max number of pages that can be possesed */
  
      /* scheduling */
      struct list_head run_list;
@@ -132,8 +131,6 @@ struct task_struct
  
      /* VM */
      struct mm_struct mm;
-    /* We need this lock to check page types and frob reference counts. */
-    spinlock_t page_lock;
  
      mm_segment_t addr_limit;
  
@@ -194,6 +191,8 @@ extern struct task_struct *idle_task[NR_CPUS];
  
  #define STACK_SIZE PAGE_SIZE
  
+#include <xeno/slab.h>
+
  extern kmem_cache_t *task_struct_cachep;
  #define alloc_task_struct()  \
    ((struct task_struct *)kmem_cache_alloc(task_struct_cachep,GFP_KERNEL))
diff --git a/xen/include/xeno/vif.h b/xen/include/xeno/vif.h

index f3ee9fa61635ac322e65bc1a956121fdf9fd967d..a557cb3802ce0a87bcd5851b09ea096c137ea47b 100644 (file)
--- a/xen/include/xeno/vif.h
+++ b/xen/include/xeno/vif.h
@@ -34,7 +34,7 @@ extern struct net_device *the_dev;
  typedef struct rx_shadow_entry_st 
  {
      unsigned short id;
-    unsigned short flush_count; /* 16 bits should be enough */
+    unsigned short _pad;
      unsigned long  pte_ptr;
      unsigned long  buf_pfn;
  } rx_shadow_entry_t;
diff --git a/xen/net/dev.c b/xen/net/dev.c

index 280db4def1e91f97e3074182322ae6ad7418d247..91d6a4e0cf6a525040885d13f5fba8f919565be9 100644 (file)
--- a/xen/net/dev.c
+++ b/xen/net/dev.c
@@ -39,12 +39,6 @@
  #define rtnl_lock() ((void)0)
  #define rtnl_unlock() ((void)0)
  
-#if 0
-#define DPRINTK(_f, _a...) printk(_f , ## _a)
-#else 
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
  #define TX_RING_INC(_i)    (((_i)+1) & (TX_RING_SIZE-1))
  #define RX_RING_INC(_i)    (((_i)+1) & (RX_RING_SIZE-1))
  #define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
@@ -54,9 +48,9 @@ struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned;
  
  static int get_tx_bufs(net_vif_t *vif);
  
-static void __make_tx_response(net_vif_t *vif, 
-                               unsigned short id, 
-                               unsigned char  st);
+static void make_tx_response(net_vif_t     *vif, 
+                             unsigned short id, 
+                             unsigned char  st);
  static void make_rx_response(net_vif_t     *vif, 
                               unsigned short id, 
                               unsigned short size,
@@ -499,89 +493,69 @@ struct netif_rx_stats netdev_rx_stat[NR_CPUS];
  void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
  {
      rx_shadow_entry_t *rx;
-    unsigned long *ptep; 
+    unsigned long *ptep, pte; 
      struct pfn_info *old_page, *new_page, *pte_page;
      unsigned int i; 
      unsigned short size;
      unsigned char  offset, status = RING_STATUS_OK;
+    struct task_struct *p = vif->domain;
  
      memcpy(skb->mac.ethernet->h_dest, vif->vmac, ETH_ALEN);
      if ( ntohs(skb->mac.ethernet->h_proto) == ETH_P_ARP )
          memcpy(skb->nh.raw + 18, vif->vmac, ETH_ALEN);
  
-    /*
-     * Slightly gross: we need the page_lock so that we can do PTE checking.
-     * However, we take it slightly early so that it can protect the update
-     * of rx_cons. This saves us from grabbing two locks.
-     */
-    spin_lock(&vif->domain->page_lock);
+    spin_lock(&vif->rx_lock);
  
      if ( (i = vif->rx_cons) == vif->rx_prod )
      {
-        spin_unlock(&vif->domain->page_lock);
+        spin_unlock(&vif->rx_lock);
          perfc_incr(net_rx_capacity_drop);
          return;
      }
-    rx = vif->rx_shadow_ring + i;
+    rx = &vif->rx_shadow_ring[i];
      vif->rx_cons = RX_RING_INC(i);
  
      size   = (unsigned short)skb->len;
      offset = (unsigned char)((unsigned long)skb->data & ~PAGE_MASK);
  
-    /* Release the page-table page. */
-    pte_page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
-    put_page_type(pte_page);
-    put_page_tot(pte_page);
-
-    old_page = frame_table + rx->buf_pfn;
+    pte_page = &frame_table[rx->pte_ptr >> PAGE_SHIFT];
+    old_page = &frame_table[rx->buf_pfn];
      new_page = skb->pf;
      
      ptep = map_domain_mem(rx->pte_ptr);
  
-    if ( (*ptep & _PAGE_PRESENT) )
+    new_page->u.domain = p;
+    wmb(); /* make dom ptr visible before updating refcnt. */
+    spin_lock(&p->page_list_lock);
+    list_add(&new_page->list, &p->page_list);
+    new_page->count_and_flags = PGC_allocated | 2;
+    spin_unlock(&p->page_list_lock);
+    get_page_type(new_page, PGT_writeable_page);
+    set_bit(_PGC_tlb_flush_on_type_change, &new_page->count_and_flags);
+    wmb(); /* Get type count and set flush bit before updating PTE. */
+
+    pte = *ptep;
+    if ( unlikely(pte & _PAGE_PRESENT) || 
+         unlikely(cmpxchg(ptep, pte, 
+                          (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
+                          ((new_page - frame_table) << PAGE_SHIFT))) != pte )
      {
-        /* Bail out if the PTE has been reused under our feet. */
-        list_add(&old_page->list, &vif->domain->pg_head);
-        old_page->flags = vif->domain->domain;
          unmap_domain_mem(ptep);
-        spin_unlock(&vif->domain->page_lock);
          status = RING_STATUS_BAD_PAGE;
          goto out;
      }
  
-    /* Give the new page to the domain, marking it writeable. */
-    set_page_type_count(new_page, 1);
-    set_page_tot_count(new_page, 1);
-    new_page->flags = vif->domain->domain | PGT_writeable_page | PG_need_flush;
-    list_add(&new_page->list, &vif->domain->pg_head);
-    
-    /* Patch the PTE to map the new page as writeable. */
      machine_to_phys_mapping[new_page - frame_table] 
-        = machine_to_phys_mapping[old_page - frame_table];        
-    *ptep = (*ptep & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT |
-        (((new_page - frame_table) << PAGE_SHIFT) & PAGE_MASK);
+        = machine_to_phys_mapping[old_page - frame_table];
      
      unmap_domain_mem(ptep);
  
-    spin_unlock(&vif->domain->page_lock);
-    
      /* Our skbuff now points at the guest's old frame. */
      skb->pf = old_page;
  
      /* Updates must happen before releasing the descriptor. */
      smp_wmb();
  
-    /*
-     * NB. The remote flush here should be safe, as we hold no locks. The 
-     * network driver that called us should also have no nasty locks.
-     */
-    if ( rx->flush_count == (unsigned short)
-         atomic_read(&tlb_flush_count[vif->domain->processor]) )
-    {
-        perfc_incr(net_rx_tlbflush);
-        flush_tlb_cpu(vif->domain->processor);
-    }
-
      perfc_incr(net_rx_delivered);
  
      /* record this so they can be billed */
@@ -589,7 +563,9 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
      vif->total_bytes_received += size;
  
   out:
+    put_page_and_type(pte_page);
      make_rx_response(vif, rx->id, size, status, offset);
+    spin_unlock(&vif->rx_lock);
  }
  
  /**
@@ -785,8 +761,8 @@ static void net_tx_action(unsigned long unused)
          skb->mac.raw  = skb->data; 
          skb->guest_id = tx->id;
          
-        skb_shinfo(skb)->frags[0].page        = frame_table +
-            (tx->payload >> PAGE_SHIFT);
+        skb_shinfo(skb)->frags[0].page        = 
+            &frame_table[tx->payload >> PAGE_SHIFT];
          skb_shinfo(skb)->frags[0].size        = tx->size - PKT_PROT_LEN;
          skb_shinfo(skb)->frags[0].page_offset = tx->payload & ~PAGE_MASK;
          skb_shinfo(skb)->nr_frags = 1;
@@ -856,10 +832,8 @@ static void tx_skb_release(struct sk_buff *skb)
  
      vif = skb->src_vif;
      
-    spin_lock(&vif->domain->page_lock);
      for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
-        put_page_tot(skb_shinfo(skb)->frags[i].page);
-    spin_unlock(&vif->domain->page_lock);
+        put_page(skb_shinfo(skb)->frags[i].page);
      
      if ( skb->skb_type == SKB_NODATA )
          kmem_cache_free(net_header_cachep, skb->head);
@@ -867,7 +841,7 @@ static void tx_skb_release(struct sk_buff *skb)
      skb_shinfo(skb)->nr_frags = 0; 
      
      spin_lock(&vif->tx_lock);
-    __make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
+    make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
      spin_unlock(&vif->tx_lock);
      
      /*
@@ -1904,7 +1878,7 @@ static int get_tx_bufs(net_vif_t *vif)
          if ( (tx.size <= PKT_PROT_LEN) || (tx.size > ETH_FRAME_LEN) )
          {
              DPRINTK("Bad packet size: %d\n", tx.size);
-            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
              continue; 
          }
  
@@ -1932,23 +1906,21 @@ static int get_tx_bufs(net_vif_t *vif)
          vif->remaining_credit -= tx.size;
  
          /* No crossing a page boundary as the payload mustn't fragment. */
-        if ( ((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE ) 
+        if ( unlikely(((tx.addr & ~PAGE_MASK) + tx.size) >= PAGE_SIZE) ) 
          {
              DPRINTK("tx.addr: %lx, size: %u, end: %lu\n", 
                      tx.addr, tx.size, (tx.addr &~PAGE_MASK) + tx.size);
-            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
              continue;
          }
  
          buf_pfn  = tx.addr >> PAGE_SHIFT;
          buf_page = frame_table + buf_pfn;
-        spin_lock(&p->page_lock);
-        if ( (buf_pfn >= max_page) || 
-             ((buf_page->flags & PG_domain_mask) != p->domain) ) 
+        if ( unlikely(buf_pfn >= max_page) || 
+             unlikely(!get_page(buf_page, p)) )
          {
              DPRINTK("Bad page frame\n");
-            spin_unlock(&p->page_lock);
-            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
              continue;
          }
              
@@ -1958,8 +1930,8 @@ static int get_tx_bufs(net_vif_t *vif)
              init_tx_header(vif, g_data, tx.size, the_dev));
          if ( protocol == 0 )
          {
-            __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
-            goto tx_unmap_and_continue;
+            make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+            goto cleanup_and_continue;
          }
  
          target = net_get_target_vif(g_data, tx.size, vif);
@@ -1969,9 +1941,9 @@ static int get_tx_bufs(net_vif_t *vif)
              /* Local delivery */
              if ( (skb = dev_alloc_skb(ETH_FRAME_LEN + 32)) == NULL )
              {
-                __make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
+                make_tx_response(vif, tx.id, RING_STATUS_BAD_PAGE);
                  put_vif(target);
-                goto tx_unmap_and_continue;
+                goto cleanup_and_continue;
              }
  
              skb->src_vif = vif;
@@ -1995,7 +1967,7 @@ static int get_tx_bufs(net_vif_t *vif)
              if ( netif_rx(skb) == NET_RX_DROP )
                  kfree_skb(skb);
  
-            __make_tx_response(vif, tx.id, RING_STATUS_OK);
+            make_tx_response(vif, tx.id, RING_STATUS_OK);
          }
          else if ( (target == VIF_PHYS) || IS_PRIV(p) )
          {
@@ -2005,23 +1977,24 @@ static int get_tx_bufs(net_vif_t *vif)
                  kmem_cache_alloc(net_header_cachep, GFP_KERNEL);
              if ( vif->tx_shadow_ring[j].header == NULL )
              { 
-                __make_tx_response(vif, tx.id, RING_STATUS_OK);
-                goto tx_unmap_and_continue;
+                make_tx_response(vif, tx.id, RING_STATUS_OK);
+                goto cleanup_and_continue;
              }
  
              memcpy(vif->tx_shadow_ring[j].header, g_data, PKT_PROT_LEN);
              vif->tx_shadow_ring[j].payload = tx.addr + PKT_PROT_LEN;
-            get_page_tot(buf_page);
+            buf_page = NULL; /* hand off our page reference */
              j = TX_RING_INC(j);
          }
          else
          {
-            __make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
+            make_tx_response(vif, tx.id, RING_STATUS_DROPPED);
          }
  
-    tx_unmap_and_continue:
+    cleanup_and_continue:
+        if ( buf_page != NULL )
+            put_page(buf_page);
          unmap_domain_mem(g_data);
-        spin_unlock(&p->page_lock);
      }
  
      /*
@@ -2044,33 +2017,18 @@ static int get_tx_bufs(net_vif_t *vif)
  }
  
  
-static long get_bufs_from_vif(net_vif_t *vif)
+static void get_rx_bufs(net_vif_t *vif)
  {
-    net_ring_t *shared_rings;
-    net_idx_t *shared_idxs;
+    struct task_struct *p = vif->domain;
+    net_ring_t *shared_rings = vif->shared_rings;
+    net_idx_t *shared_idxs = vif->shared_idxs;
      unsigned int i, j;
      rx_req_entry_t rx;
      unsigned long  pte_pfn, buf_pfn;
      struct pfn_info *pte_page, *buf_page;
-    struct task_struct *p = vif->domain;
-    unsigned long *ptep;    
-
-    shared_idxs  = vif->shared_idxs;
-    shared_rings = vif->shared_rings;
-        
-    /*
-     * PHASE 1 -- TRANSMIT RING
-     */
-
-    if ( get_tx_bufs(vif) )
-    {
-        add_to_net_schedule_list_tail(vif);
-        maybe_schedule_tx_action();
-    }
+    unsigned long *ptep, pte;
  
-    /*
-     * PHASE 2 -- RECEIVE RING
-     */
+    spin_lock(&vif->rx_lock);
  
      /*
       * Collect up new receive buffers. We collect up to the guest OS's new
@@ -2085,66 +2043,83 @@ static long get_bufs_from_vif(net_vif_t *vif)
      {
          rx = shared_rings->rx_ring[i].req;
  
-        pte_pfn = rx.addr >> PAGE_SHIFT;
-        pte_page = frame_table + pte_pfn;
+        pte_pfn  = rx.addr >> PAGE_SHIFT;
+        pte_page = &frame_table[pte_pfn];
              
-        spin_lock(&p->page_lock);
-        if ( (pte_pfn >= max_page) || 
-             ((pte_page->flags & (PG_type_mask | PG_domain_mask)) != 
-              (PGT_l1_page_table | p->domain)) ) 
+        /* The address passed down must be to a valid PTE. */
+        if ( unlikely(pte_pfn >= max_page) ||
+             unlikely(!get_page_and_type(pte_page, p, PGT_l1_page_table)) )
          {
              DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
-                    p->domain, pte_pfn, max_page, pte_page->flags);
-            spin_unlock(&p->page_lock);
+                    p->domain, pte_pfn, max_page, pte_page->type_and_flags);
              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
              continue;
          }
-            
+        
          ptep = map_domain_mem(rx.addr);
-            
-        if ( !(*ptep & _PAGE_PRESENT) )
+        pte  = *ptep;
+        
+        /* We must be passed a valid writeable mapping to swizzle. */
+        if ( unlikely((pte & (_PAGE_PRESENT|_PAGE_RW)) != 
+                      (_PAGE_PRESENT|_PAGE_RW)) ||
+             unlikely(cmpxchg(ptep, pte, pte & ~_PAGE_PRESENT) != pte) )
          {
-            DPRINTK("Invalid PTE passed down (not present)\n");
+            DPRINTK("Invalid PTE passed down (not present or changing)\n");
+            put_page_and_type(pte_page);
              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
              goto rx_unmap_and_continue;
          }
-            
-        buf_pfn  = *ptep >> PAGE_SHIFT;
-        buf_page = frame_table + buf_pfn;
+        
+        buf_pfn  = pte >> PAGE_SHIFT;
+        buf_page = &frame_table[buf_pfn];
  
-        if ( ((buf_page->flags & (PG_type_mask | PG_domain_mask)) !=
-              (PGT_writeable_page | p->domain)) || 
-             (page_tot_count(buf_page) != 1) )
+        /*
+         * The page must belong to the correct domain, and must be mapped
+         * just once as a writeable page.
+         */
+        if ( unlikely(buf_page->u.domain != p) ||
+             unlikely(!test_and_clear_bit(_PGC_allocated, 
+                                          &buf_page->count_and_flags)) ||
+             unlikely(cmpxchg(&buf_page->type_and_flags, 
+                              PGT_writeable_page|PGT_validated|1,
+                              0) != (PGT_writeable_page|PGT_validated|1)) )
          {
-            DPRINTK("Need a mapped-once writeable page (%ld/%ld/%08lx)\n",
-                    page_type_count(buf_page), page_tot_count(buf_page), 
-                    buf_page->flags);
+            DPRINTK("Bad domain or page mapped writeable more than once.\n");
+            if ( buf_page->u.domain == p )
+                set_bit(_PGC_allocated, &buf_page->count_and_flags);
+            if ( unlikely(cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
+                          (pte & ~_PAGE_PRESENT)) )
+                put_page_and_type(buf_page);
              make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
              goto rx_unmap_and_continue;
          }
-            
+
          /*
-         * The pte they passed was good, so take it away from them. We also
-         * lock down the page-table page, so it doesn't go away.
+         * Now ensure that we can take the last references to this page.
+         * The final count should be 2, because of PGC_allocated.
           */
-        get_page_type(pte_page);
-        get_page_tot(pte_page);
-        *ptep &= ~_PAGE_PRESENT;
-        buf_page->flags = 0;
-        set_page_type_count(buf_page, 0);
-        set_page_tot_count(buf_page, 0);
+        if ( unlikely(cmpxchg(&buf_page->count_and_flags, 
+                              PGC_tlb_flush_on_type_change | 2, 0) != 
+                      (PGC_tlb_flush_on_type_change | 2)) )
+        {
+            DPRINTK("Page held more than once\n");
+            /* Leave the page unmapped at 'ptep'. Stoopid domain! */
+            make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
+            goto rx_unmap_and_continue;
+        }
+            
+        /* Remove from the domain's allocation list. */
+        spin_lock(&p->page_list_lock);
          list_del(&buf_page->list);
+        spin_unlock(&p->page_list_lock);
  
-        vif->rx_shadow_ring[j].id          = rx.id;
-        vif->rx_shadow_ring[j].pte_ptr     = rx.addr;
-        vif->rx_shadow_ring[j].buf_pfn     = buf_pfn;
-        vif->rx_shadow_ring[j].flush_count = (unsigned short) 
-            atomic_read(&tlb_flush_count[smp_processor_id()]);
+        vif->rx_shadow_ring[j].id      = rx.id;
+        vif->rx_shadow_ring[j].pte_ptr = rx.addr;
+        vif->rx_shadow_ring[j].buf_pfn = buf_pfn;
          j = RX_RING_INC(j);
              
      rx_unmap_and_continue:
          unmap_domain_mem(ptep);
-        spin_unlock(&p->page_lock);
      }
  
      vif->rx_req_cons = i;
@@ -2155,6 +2130,20 @@ static long get_bufs_from_vif(net_vif_t *vif)
          vif->rx_prod = j;
      }
  
+    spin_unlock(&vif->rx_lock);
+}
+
+
+static long get_bufs_from_vif(net_vif_t *vif)
+{
+    if ( get_tx_bufs(vif) )
+    {
+        add_to_net_schedule_list_tail(vif);
+        maybe_schedule_tx_action();
+    }
+
+    get_rx_bufs(vif);
+
      return 0;
  }
  
@@ -2162,7 +2151,7 @@ static long get_bufs_from_vif(net_vif_t *vif)
  long flush_bufs_for_vif(net_vif_t *vif)
  {
      int i;
-    unsigned long *pte;
+    unsigned long *ptep, pte;
      struct pfn_info *page;
      struct task_struct *p = vif->domain;
      rx_shadow_entry_t *rx;
@@ -2170,7 +2159,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
      net_idx_t *shared_idxs = vif->shared_idxs;
  
      /* Return any outstanding receive buffers to the guest OS. */
-    spin_lock(&p->page_lock);
+    spin_lock(&vif->rx_lock);
      for ( i = vif->rx_req_cons; 
            (i != shared_idxs->rx_req_prod) && 
                (((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1); 
@@ -2184,32 +2173,32 @@ long flush_bufs_for_vif(net_vif_t *vif)
      {
          rx = &vif->rx_shadow_ring[i];
  
-        /* Release the page-table page. */
-        page = frame_table + (rx->pte_ptr >> PAGE_SHIFT);
-        put_page_type(page);
-        put_page_tot(page);
-
          /* Give the buffer page back to the domain. */
-        page = frame_table + rx->buf_pfn;
-        list_add(&page->list, &p->pg_head);
-        page->flags = vif->domain->domain;
+        page = &frame_table[rx->buf_pfn];
+        spin_lock(&p->page_list_lock);
+        list_add(&page->list, &p->page_list);
+        page->count_and_flags = PGC_allocated | 2;
+        spin_unlock(&p->page_list_lock);
+        get_page_type(page, PGT_writeable_page);
+        set_bit(_PGC_tlb_flush_on_type_change, &page->count_and_flags);
+        wmb();
  
          /* Patch up the PTE if it hasn't changed under our feet. */
-        pte = map_domain_mem(rx->pte_ptr);
-        if ( !(*pte & _PAGE_PRESENT) )
-        {
-            *pte = (rx->buf_pfn<<PAGE_SHIFT) | (*pte & ~PAGE_MASK) | 
-                _PAGE_RW | _PAGE_PRESENT;
-            page->flags |= PGT_writeable_page | PG_need_flush;
-            set_page_type_count(page, 1);
-            set_page_tot_count(page, 1);
-        }
-        unmap_domain_mem(pte);
+        ptep = map_domain_mem(rx->pte_ptr);
+        pte  = *ptep;
+        if ( unlikely(pte & _PAGE_PRESENT) ||
+             unlikely(cmpxchg(ptep, pte, (rx->buf_pfn<<PAGE_SHIFT) | 
+                              (pte & ~PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT)
+                      != pte) )
+            put_page_and_type(page);
+        unmap_domain_mem(ptep);
+
+        put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
  
          make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
      }
      vif->rx_cons = i;
-    spin_unlock(&p->page_lock);
+    spin_unlock(&vif->rx_lock);
  
      /*
       * Flush pending transmit buffers. The guest may still have to wait for
@@ -2221,7 +2210,7 @@ long flush_bufs_for_vif(net_vif_t *vif)
                (((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1); 
            i = TX_RING_INC(i) )
      {
-        __make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
+        make_tx_response(vif, shared_rings->tx_ring[i].req.id, 
                             RING_STATUS_DROPPED);
      }
      vif->tx_req_cons = i;
@@ -2296,9 +2285,9 @@ long do_net_io_op(netop_t *uop)
  }
  
  
-static void __make_tx_response(net_vif_t     *vif, 
-                               unsigned short id, 
-                               unsigned char  st)
+static void make_tx_response(net_vif_t     *vif, 
+                             unsigned short id, 
+                             unsigned char  st)
  {
      unsigned int pos;
      tx_resp_entry_t *resp;
@@ -2329,7 +2318,6 @@ static void make_rx_response(net_vif_t     *vif,
      rx_resp_entry_t *resp;
  
      /* Place on the response ring for the relevant domain. */ 
-    spin_lock(&vif->rx_lock);
      pos  = vif->rx_resp_prod;
      resp = &vif->shared_rings->rx_ring[pos].resp;
      resp->id     = id;
@@ -2344,7 +2332,6 @@ static void make_rx_response(net_vif_t     *vif,
          unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
          guest_event_notify(cpu_mask);    
      }
-    spin_unlock(&vif->rx_lock);
  }
  
  
diff --git a/xen/net/skbuff.c b/xen/net/skbuff.c

index d8950633b952153b5c4c3d867743e1f62fcf1482..5fcc044c5e71074f30ae050b0ad59f2369dc5b53 100644 (file)
--- a/xen/net/skbuff.c
+++ b/xen/net/skbuff.c
@@ -133,41 +133,20 @@ static __inline__ void skb_head_to_pool(struct sk_buff *skb)
  
  static inline u8 *alloc_skb_data_page(struct sk_buff *skb)
  {
-    struct list_head *list_ptr;
-    struct pfn_info  *pf;
-    unsigned long flags;
-        
-    spin_lock_irqsave(&free_list_lock, flags);
-
-    if (!free_pfns) return NULL;
-
-    list_ptr = free_list.next;
-    pf = list_entry(list_ptr, struct pfn_info, list);
-    pf->flags = 0;
-    list_del(&pf->list);
-    free_pfns--;
-
-    spin_unlock_irqrestore(&free_list_lock, flags);
-
+    struct pfn_info *pf;
+    if ( unlikely((pf = alloc_domain_page(NULL)) == NULL) )
+        return NULL;
      skb->pf = pf;
      return (u8 *)((pf - frame_table) << PAGE_SHIFT);
  }
  
  static inline void dealloc_skb_data_page(struct sk_buff *skb)
  {
-    struct pfn_info  *pf;
+    struct pfn_info *pf = skb->pf;
      unsigned long flags;
-
-    pf = skb->pf;
-
      spin_lock_irqsave(&free_list_lock, flags);
-        
-    pf->flags = 0;
-    set_page_type_count(pf, 0);
-    set_page_tot_count(pf, 0);
      list_add(&pf->list, &free_list);
      free_pfns++;
-
      spin_unlock_irqrestore(&free_list_lock, flags);
  
  }
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c

index b4784ccc0234edc4ba11ada8fa5f0cb08c088230..6bc8baa47abf37b9774dd90f5711f31fd3a57f68 100644 (file)
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c
@@ -40,7 +40,7 @@ static void DEBUG_allow_pt_reads(void)
          pte = update_debug_queue[i].ptep;
          if ( pte == NULL ) continue;
          update_debug_queue[i].ptep = NULL;
-        update.ptr = pte;
+        update.ptr = virt_to_machine(pte);
          update.val = update_debug_queue[i].pteval;
          HYPERVISOR_mmu_update(&update, 1);
      }
@@ -59,7 +59,7 @@ static void DEBUG_disallow_pt_read(unsigned long va)
      pgd = pgd_offset_k(va);
      pmd = pmd_offset(pgd, va);
      pte = pte_offset(pmd, va);
-    update.ptr = pte;
+    update.ptr = virt_to_machine(pte);
      pteval = *(unsigned long *)pte;
      update.val = pteval & ~_PAGE_PRESENT;
      HYPERVISOR_mmu_update(&update, 1);
@@ -95,7 +95,9 @@ void MULTICALL_flush_page_update_queue(void)
  #if MMU_UPDATE_DEBUG > 0
          DEBUG_allow_pt_reads();
  #endif
-        queue_multicall2(__HYPERVISOR_mmu_update, (unsigned long)update_queue, idx);
+        queue_multicall2(__HYPERVISOR_mmu_update, 
+                         (unsigned long)update_queue, 
+                         idx);
          idx = 0;
      }
      spin_unlock_irqrestore(&update_lock, flags);
@@ -134,7 +136,7 @@ void queue_l1_entry_update(pte_t *ptr, unsigned long val)
  #if MMU_UPDATE_DEBUG > 0
      DEBUG_disallow_pt_read((unsigned long)ptr);
  #endif
-    update_queue[idx].ptr = (unsigned long)ptr;
+    update_queue[idx].ptr = virt_to_machine(ptr);
      update_queue[idx].val = val;
      increment_index();
      spin_unlock_irqrestore(&update_lock, flags);
@@ -144,7 +146,7 @@ void queue_l2_entry_update(pmd_t *ptr, unsigned long val)
  {
      unsigned long flags;
      spin_lock_irqsave(&update_lock, flags);
-    update_queue[idx].ptr = (unsigned long)ptr;
+    update_queue[idx].ptr = virt_to_machine(ptr);
      update_queue[idx].val = val;
      increment_index();
      spin_unlock_irqrestore(&update_lock, flags);
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c

index 883cd03b37dd90ba623644a954826ac18a606aff..b1f8019ef9e172de9a6d257ab8b543bfe26dbf20 100644 (file)
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/init.c
@@ -113,13 +113,10 @@ static inline void set_pte_phys (unsigned long vaddr,
      }
      pte = pte_offset(pmd, vaddr);
  
-#if 0 /* Not in Xen, since this breaks clear_fixmap. */
-    if (pte_val(*pte))
-        pte_ERROR(*pte);
-#endif
-
-    /* We queue directly, avoiding hidden phys->machine translation. */
-    queue_l1_entry_update(pte, phys | pgprot_val(prot));
+    if ( pte_io(*pte) || (pgprot_val(prot) & _PAGE_IO) )
+        queue_unchecked_mmu_update(pte, phys | pgprot_val(prot));
+    else
+        queue_l1_entry_update(pte, phys | pgprot_val(prot));
  
      /*
       * It's enough to flush this one mapping.
@@ -137,8 +134,7 @@ void __set_fixmap(enum fixed_addresses idx, unsigned long phys,
          printk("Invalid __set_fixmap\n");
          return;
      }
-    set_pte_phys(address, phys, 
-                 __pgprot(pgprot_val(PAGE_KERNEL)|pgprot_val(flags)));
+    set_pte_phys(address, phys, flags);
  }
  
  void clear_fixmap(enum fixed_addresses idx)
diff --git a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c

index eac5c6a63c09983f48bf01a567356ac4ec5754e6..078fede1447323b964afe590254bf5f3bec47e06 100644 (file)
--- a/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
+++ b/xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c
@@ -202,14 +202,15 @@ void __init *bt_ioremap(unsigned long machine_addr, unsigned long size)
           */
          nrpages = size >> PAGE_SHIFT;
          if (nrpages > NR_FIX_BTMAPS)
-                return NULL;
+            return NULL;
  
          /*
           * Ok, go for it..
           */
          idx = FIX_BTMAP_BEGIN;
          while (nrpages > 0) {
-                set_fixmap(idx, machine_addr);
+                __set_fixmap(idx, machine_addr, 
+                             __pgprot(__PAGE_KERNEL|_PAGE_IO));
                  machine_addr += PAGE_SIZE;
                  --idx;
                  --nrpages;
author	kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
	Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)
committer	kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
	Sat, 20 Dec 2003 12:44:11 +0000 (12:44 +0000)
.rootkeys		patch \| blob \| history
tools/xc/lib/xc_linux_build.c		patch \| blob \| history
tools/xc/lib/xc_linux_restore.c		patch \| blob \| history
tools/xc/lib/xc_linux_save.c		patch \| blob \| history
xen/GUEST_CHANGES	[deleted file]	patch \| blob \| history
xen/TODO	[deleted file]	patch \| blob \| history
xen/arch/i386/Rules.mk		patch \| blob \| history
xen/arch/i386/apic.c		patch \| blob \| history
xen/arch/i386/entry.S		patch \| blob \| history
xen/arch/i386/flushtlb.c	[new file with mode: 0644]	patch \| blob
xen/arch/i386/io_apic.c		patch \| blob \| history
xen/arch/i386/ioremap.c		patch \| blob \| history
xen/arch/i386/irq.c		patch \| blob \| history
xen/arch/i386/mm.c		patch \| blob \| history
xen/arch/i386/pci-irq.c		patch \| blob \| history
xen/arch/i386/process.c		patch \| blob \| history
xen/arch/i386/smp.c		patch \| blob \| history
xen/arch/i386/smpboot.c		patch \| blob \| history
xen/arch/i386/traps.c		patch \| blob \| history
xen/common/dom0_ops.c		patch \| blob \| history
xen/common/dom_mem_ops.c		patch \| blob \| history
xen/common/domain.c		patch \| blob \| history
xen/common/kernel.c		patch \| blob \| history
xen/common/memory.c		patch \| blob \| history
xen/common/network.c		patch \| blob \| history
xen/common/page_alloc.c		patch \| blob \| history
xen/drivers/block/ll_rw_blk.c		patch \| blob \| history
xen/drivers/block/xen_block.c		patch \| blob \| history
xen/drivers/block/xen_vbd.c		patch \| blob \| history
xen/drivers/net/e1000/e1000_main.c		patch \| blob \| history
xen/include/asm-i386/atomic.h		patch \| blob \| history
xen/include/asm-i386/flushtlb.h		patch \| blob \| history
xen/include/asm-i386/io.h		patch \| blob \| history
xen/include/asm-i386/page.h		patch \| blob \| history
xen/include/asm-i386/pgalloc.h		patch \| blob \| history
xen/include/asm-i386/smp.h		patch \| blob \| history
xen/include/asm-i386/spinlock.h		patch \| blob \| history
xen/include/asm-i386/system.h		patch \| blob \| history
xen/include/hypervisor-ifs/dom0_ops.h		patch \| blob \| history
xen/include/hypervisor-ifs/hypervisor-if.h		patch \| blob \| history
xen/include/xeno/config.h		patch \| blob \| history
xen/include/xeno/mm.h		patch \| blob \| history
xen/include/xeno/perfc.h		patch \| blob \| history
xen/include/xeno/perfc_defn.h		patch \| blob \| history
xen/include/xeno/sched.h		patch \| blob \| history
xen/include/xeno/vif.h		patch \| blob \| history
xen/net/dev.c		patch \| blob \| history
xen/net/skbuff.c		patch \| blob \| history
xenolinux-2.4.23-sparse/arch/xeno/mm/hypervisor.c		patch \| blob \| history
xenolinux-2.4.23-sparse/arch/xeno/mm/init.c		patch \| blob \| history
xenolinux-2.4.23-sparse/arch/xeno/mm/ioremap.c		patch \| blob \| history